import requests import pandas as pd from io import StringIO import streamlit as st import os import plotly.express as px import plotly.graph_objects as go import plotly.colors as pc import numpy as np from sklearn.metrics import mean_squared_error from statsmodels.tsa.stattools import acf from statsmodels.graphics.tsaplots import plot_acf import matplotlib.pyplot as plt ##GET ALL FILES FROM GITHUB def load_GitHub(github_token, file_name): url = f'https://raw.githubusercontent.com/margaridamascarenhas/Transparency_Data/main/{file_name}' headers = {'Authorization': f'token {github_token}'} response = requests.get(url, headers=headers) if response.status_code == 200: csv_content = StringIO(response.text) df = pd.read_csv(csv_content) if 'Date' in df.columns: df['Date'] = pd.to_datetime(df['Date']) # Convert 'Date' column to datetime df.set_index('Date', inplace=True) # Set 'Date' column as the index #df.to_csv(file_name) return df else: print(f"Failed to download {file_name}. Status code: {response.status_code}") return None def load_forecast(github_token): predictions_dict = {} for hour in range(24): file_name = f'Predictions_{hour}h.csv' df = load_GitHub(github_token, file_name) if df is not None: predictions_dict[file_name] = df return predictions_dict def convert_European_time(data, time_zone): data.index = pd.to_datetime(data.index, utc=True) data.index = data.index.tz_convert(time_zone) data.index = data.index.tz_localize(None) return data github_token = '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' if github_token: forecast_dict = load_forecast(github_token) historical_forecast=load_GitHub(github_token, 'Historical_forecast.csv') Data_BE=load_GitHub(github_token, 'BE_Elia_Entsoe_UTC.csv') Data_FR=load_GitHub(github_token, 'FR_Entsoe_UTC.csv') Data_NL=load_GitHub(github_token, 'NL_Entsoe_UTC.csv') Data_DE=load_GitHub(github_token, 'DE_Entsoe_UTC.csv') Data_BE=convert_European_time(Data_BE, 'Europe/Brussels') Data_FR=convert_European_time(Data_FR, 'Europe/Paris') Data_NL=convert_European_time(Data_NL, 'Europe/Amsterdam') Data_DE=convert_European_time(Data_DE, 'Europe/Berlin') else: print("Please enter your GitHub Personal Access Token to proceed.") def conformal_predictions(data, target, my_forecast): data['Residuals'] = data[my_forecast] - data[actual_col] data['Hour'] = data.index.hour min_date = data.index.min() for date in data.index.normalize().unique(): if date >= min_date + pd.DateOffset(days=30): start_date = date - pd.DateOffset(days=30) end_date = date calculation_window = data[start_date:end_date-pd.DateOffset(hours=1)] quantiles = calculation_window.groupby('Hour')['Residuals'].quantile(0.8) # Use .loc to safely access and modify data if date in data.index: current_day_data = data.loc[date.strftime('%Y-%m-%d')] for hour in current_day_data['Hour'].unique(): if hour in quantiles.index: hour_quantile = quantiles[hour] idx = (data.index.normalize() == date) & (data.Hour == hour) data.loc[idx, 'Quantile_80'] = hour_quantile data.loc[idx, 'Lower_Interval'] = data.loc[idx, my_forecast] - hour_quantile data.loc[idx, 'Upper_Interval'] = data.loc[idx, my_forecast] + hour_quantile #data.reset_index(inplace=True) return data st.title("Transparency++") countries = { 'Belgium': 'BE', 'Netherlands': 'NL', 'Germany': 'DE', 'France': 'FR', } st.sidebar.header('Filters') selected_country = st.sidebar.selectbox('Select Country', list(countries.keys())) st.write() date_range = st.sidebar.date_input("Select Date Range for Metrics Calculation:", value=(pd.to_datetime("2024-01-01"), pd.to_datetime(pd.Timestamp('today')))) # Ensure the date range provides two dates if len(date_range) == 2: start_date = pd.Timestamp(date_range[0]) end_date = pd.Timestamp(date_range[1]) else: st.error("Please select a valid date range.") st.stop() # Sidebar with radio buttons for different sections section = st.sidebar.radio('Section', ['Data', 'Forecasts', 'Insights']) country_code = countries[selected_country] if country_code == 'BE': data = Data_BE weather_columns = ['Temperature', 'Wind Speed Onshore', 'Wind Speed Offshore'] data['Temperature'] = data['temperature_2m_8'] data['Wind Speed Offshore'] = data['wind_speed_100m_4'] data['Wind Speed Onshore'] = data['wind_speed_100m_8'] elif country_code == 'DE': data = Data_DE weather_columns = ['Temperature', 'Wind Speed'] data['Temperature'] = data['temperature_2m'] data['Wind Speed'] = data['wind_speed_100m'] elif country_code == 'NL': data = Data_NL weather_columns = ['Temperature', 'Wind Speed'] data['Temperature'] = data['temperature_2m'] data['Wind Speed'] = data['wind_speed_100m'] elif country_code == 'FR': data = Data_FR weather_columns = ['Temperature', 'Wind Speed'] data['Temperature'] = data['temperature_2m'] data['Wind Speed'] = data['wind_speed_100m'] def add_feature(df2, df_main): #df_main.index = pd.to_datetime(df_main.index) #df2.index = pd.to_datetime(df2.index) df_combined = df_main.combine_first(df2) last_date_df1 = df_main.index.max() first_date_df2 = df2.index.min() if first_date_df2 == last_date_df1 + pd.Timedelta(hours=1): df_combined = pd.concat([df_main, df2[df2.index > last_date_df1]], axis=0) #df_combined.reset_index(inplace=True) return df_combined #data.index = data.index.tz_localize('UTC') data = data.loc[start_date:end_date] forecast_columns = [ 'Load_entsoe','Load_forecast_entsoe','Wind_onshore_entsoe','Wind_onshore_forecast_entsoe','Wind_offshore_entsoe','Wind_offshore_forecast_entsoe','Solar_entsoe','Solar_forecast_entsoe'] if section == 'Data': st.header("Data") st.write(""" This section allows you to explore and upload your datasets. You can visualize raw data, clean it, and prepare it for analysis. """) st.header('Data Quality') output_text = f"The below percentages are calculated from the selected date range from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}. This interval can be adjusted from the sidebar." st.write(output_text) # Report % of missing values missing_values = data[forecast_columns].isna().mean() * 100 missing_values = missing_values.round(2) installed_capacities = { 'FR': { 'Solar': 17419, 'Wind Offshore': 1483, 'Wind Onshore': 22134}, 'DE': { 'Solar': 73821, 'Wind Offshore': 8386, 'Wind Onshore': 59915}, 'BE': { 'Solar': 8789, 'Wind Offshore': 2262, 'Wind Onshore': 3053}, 'NL': { 'Solar': 22590, 'Wind Offshore': 3220, 'Wind Onshore': 6190}, } if country_code not in installed_capacities: st.error(f"Installed capacities not defined for country code '{country_code}'.") st.stop() # Report % of extreme, impossible values for the selected country capacities = installed_capacities[country_code] extreme_values = {} for col in forecast_columns: if 'Solar_entsoe' in col: extreme_values[col] = ((data[col] < 0) | (data[col] > capacities['Solar'])).mean() * 100 elif 'Solar_forecast_entsoe' in col: extreme_values[col] = ((data[col] < 0) | (data[col] > capacities['Solar'])).mean() * 100 elif 'Wind_onshore_entsoe' in col: extreme_values[col] = ((data[col] < 0) | (data[col] > capacities['Wind Onshore'])).mean() * 100 elif 'Wind_onshore_forecast_entsoe' in col: extreme_values[col] = ((data[col] < 0) | (data[col] > capacities['Wind Onshore'])).mean() * 100 elif 'Wind_offshore_entsoe' in col: extreme_values[col] = ((data[col] < 0) | (data[col] > capacities['Wind Offshore'])).mean() * 100 elif 'Wind_offshore_forecast_entsoe' in col: extreme_values[col] = ((data[col] < 0) | (data[col] > capacities['Wind Offshore'])).mean() * 100 elif 'Load_entsoe' in col: extreme_values[col] = ((data[col] < 0)).mean() * 100 elif 'Load_forecast_entsoe' in col: extreme_values[col] = ((data[col] < 0)).mean() * 100 extreme_values = pd.Series(extreme_values).round(2) # Combine all metrics into one DataFrame metrics_df = pd.DataFrame({ 'Missing Values (%)': missing_values, 'Extreme/Nonsensical Values (%)': extreme_values, }) st.markdown( """ """, unsafe_allow_html=True ) st.dataframe(metrics_df) st.write('Missing values (%): Percentage of missing values in the dataset', unsafe_allow_html=True) st.write('Extreme/Nonsensical values (%): Values that are considered implausible such as negative or out-of-bound values i.e., (generation<0) or (generation>capacity)', unsafe_allow_html=True) # Section 2: Forecasts elif section == 'Forecasts': st.header('Forecast Quality') # Time series for last 1 week st.subheader('Time Series: Last 1 Week') last_week = Data_BE.loc[Data_BE.index >= (data.index[-1] - pd.Timedelta(days=7))] st.write('The below plots show the time series of forecasts vs. observations provided by the ENTSO-E Transparency platform between the selected data range.') forecast_columns_operational = [ 'Load_entsoe','Load_forecast_entsoe', 'Load_LightGBMModel.7D.TimeCov.Temp.Forecast_elia', 'Wind_onshore_entsoe','Wind_onshore_forecast_entsoe','Wind_onshore_LightGBMModel.1D.TimeCov.Temp.Forecast_elia','Wind_offshore_entsoe','Wind_offshore_forecast_entsoe','Wind_offshore_LightGBMModel.1D.TimeCov.Temp.Forecast_elia','Solar_entsoe','Solar_forecast_entsoe', 'Solar_LightGBMModel.1D.TimeCov.Temp.Forecast_elia'] forecast_columns = [ 'Load_entsoe','Load_forecast_entsoe','Wind_onshore_entsoe','Wind_onshore_forecast_entsoe','Wind_offshore_entsoe','Wind_offshore_forecast_entsoe','Solar_entsoe','Solar_forecast_entsoe'] operation_forecast_load=forecast_dict['Predictions_10h.csv'].filter(like='Load_', axis=1) operation_forecast_res=forecast_dict['Predictions_17h.csv'].filter(regex='^(?!Load_)') operation_forecast_load.columns = [col.replace('_entsoe.', '_').replace('Naive.7D', 'WeeklyNaiveSeasonal') for col in operation_forecast_load.columns] operation_forecast_res.columns = [col.replace('_entsoe.', '_').replace('Naive.1D', 'DailyNaiveSeasonal') for col in operation_forecast_res.columns] Historical_and_Load=add_feature(operation_forecast_load, historical_forecast) Historical_and_operational=add_feature(operation_forecast_res, Historical_and_Load) #print(Historical_and_operational.filter(like='Forecast_elia', axis=1)) best_forecast = Historical_and_operational.filter(like='Forecast_elia', axis=1) df_combined = Historical_and_operational.join(Data_BE, how='inner') last_week_best_forecast = best_forecast.loc[best_forecast.index >= (best_forecast.index[-24] - pd.Timedelta(days=8))] for i in range(0, len(forecast_columns_operational), 3): actual_col = forecast_columns_operational[i] forecast_col = forecast_columns_operational[i + 1] my_forecast = forecast_columns_operational[i + 2] if forecast_col in data.columns: fig = go.Figure() fig.add_trace(go.Scatter(x=last_week.index, y=last_week[actual_col], mode='lines', name='Actual')) fig.add_trace(go.Scatter(x=last_week.index, y=last_week[forecast_col], mode='lines', name='Forecast ENTSO-E')) if country_code=='BE': conformal=conformal_predictions(df_combined, actual_col, my_forecast) last_week_conformal = conformal.loc[conformal.index >= (conformal.index[-24] - pd.Timedelta(days=8))] if actual_col =='Load_entsoe': last_week_conformal = conformal.loc[conformal.index >= (conformal.index[-24] - pd.Timedelta(days=5))] fig.add_trace(go.Scatter(x=last_week_best_forecast.index, y=last_week_best_forecast[my_forecast], mode='lines', name='Forecast EDS')) fig.add_trace(go.Scatter( x=last_week_conformal.index, y=last_week_conformal['Lower_Interval'], mode='lines', line=dict(width=0), showlegend=False )) # Add the upper interval trace and fill to the lower interval fig.add_trace(go.Scatter( x=last_week_conformal.index, y=last_week_conformal['Upper_Interval'], mode='lines', line=dict(width=0), fill='tonexty', # Fill between this trace and the previous one fillcolor='rgba(68, 68, 68, 0.3)', name='P10/P90 prediction intervals' )) fig.update_layout(title=f'Forecasts vs Actual for {actual_col}', xaxis_title='Date', yaxis_title='Value [MW]') st.plotly_chart(fig) def plot_category(df_dict, category_prefix, title): fig = go.Figure() # Define base colors for each model model_colors = { 'LightGBMModel.TimeCov.Temp.Forecast_elia': '#1f77b4', # Blue 'LightGBMModel.TimeCov.Temp': '#2ca02c', # Green 'Naive': '#ff7f0e' # Orange } # To keep track of which model has been added to the legend legend_added = {'LightGBMModel.TimeCov.Temp.Forecast_elia': False, 'LightGBMModel.TimeCov.Temp': False, 'Naive': False} for file_name, df in df_dict.items(): # Extract the hour from the filename, assuming the format is "Predictions_Xh.csv" hour = int(file_name.split('_')[1].replace('h.csv', '')) filtered_columns = [col for col in df.columns if col.startswith(category_prefix)] for column in filtered_columns: # Identify the model type with more precise logic if 'LightGBMModel' in column: if 'Forecast_elia' in column: model_key = 'LightGBMModel.TimeCov.Temp.Forecast_elia' elif 'TimeCov' in column: model_key = 'LightGBMModel.TimeCov.Temp' elif 'Naive' in column: model_key = 'Naive' else: continue # Skip if it doesn't match any model type # Extract the relevant part of the model name parts = column.split('.') model_name_parts = parts[1:] # Skip the variable prefix model_name = '.'.join(model_name_parts) # Rejoin the parts to form the model name # Get the base color for the model base_color = model_colors[model_key] # Calculate the color shade based on the hour color_scale = pc.hex_to_rgb(base_color) scale_factor = 0.3 + (hour / 40) # Adjust scale to ensure the gradient is visible adjusted_color = tuple(int(c * scale_factor) for c in color_scale) # Convert to RGBA with transparency for plot lines line_color = f'rgba({adjusted_color[0]}, {adjusted_color[1]}, {adjusted_color[2]}, 0.1)' # Transparent color for lines # Combine the hour and the model name for the legend, but only add the legend entry once show_legend = not legend_added[model_key] fig.add_trace(go.Scatter( x=df.index, # Assuming 'Date' is the index, use 'df.index' for x-axis y=df[column], mode='lines', name=model_name if show_legend else None, # Use the model name for the legend, but only once line=dict(color=base_color if show_legend else line_color), # Use opaque color for legend, transparent for lines showlegend=show_legend, # Show legend only once per model legendgroup=model_key # Grouping for consistent legend color )) # Mark that this model has been added to the legend if show_legend: legend_added[model_key] = True # Add real values as a separate trace, if provided filtered_Data_BE_df = Data_BE.loc[df.index] if filtered_Data_BE_df[f'{category_prefix}_entsoe'].notna().any(): fig.add_trace(go.Scatter( x=filtered_Data_BE_df.index, y=filtered_Data_BE_df[f'{category_prefix}_entsoe'], mode='lines', name=f'Actual {category_prefix}', line=dict(color='black', width=2), # Black line for real values showlegend=True # Always show this in the legend )) # Update layout to position the legend at the top, side by side fig.update_layout( title=dict( text=title, x=0, # Center the title horizontally y=1.00, # Slightly lower the title to create more space xanchor='left', yanchor='top' ), xaxis_title='Date', yaxis_title='Value', legend=dict( orientation="h", # Horizontal legend yanchor="bottom", # Align to the bottom of the legend box y=1, # Increase y position to avoid overlap with the title xanchor="center", # Center the legend horizontally x=0.5 # Position at the center of the plot ) ) return fig if country_code == "BE": st.header('EDS Forecasts by Hour') solar_fig = plot_category(forecast_dict, 'Solar', 'Solar Predictions') st.plotly_chart(solar_fig) wind_offshore_fig = plot_category(forecast_dict, 'Wind_offshore', 'Wind Offshore Predictions') st.plotly_chart(wind_offshore_fig) wind_onshore_fig = plot_category(forecast_dict, 'Wind_onshore', 'Wind Onshore Predictions') st.plotly_chart(wind_onshore_fig) load_fig = plot_category(forecast_dict, 'Load', 'Load Predictions') st.plotly_chart(load_fig) # Scatter plots for error distribution st.subheader('Error Distribution') st.write('The below scatter plots show the error distribution of all three fields: Solar, Wind and Load between the selected date range') for i in range(0, len(forecast_columns), 2): actual_col = forecast_columns[i] forecast_col = forecast_columns[i + 1] if forecast_col in data.columns: obs = last_week[actual_col] pred = last_week[forecast_col] error = pred - obs fig = px.scatter(x=obs, y=pred, labels={'x': 'Observed [MW]', 'y': 'Predicted by ENTSO-E [MW]'}) fig.update_layout(title=f'Error Distribution for {forecast_col}') st.plotly_chart(fig) st.subheader('Accuracy Metrics (Sorted by rMAE):') if country_code == "BE": # Combine the two DataFrames on their index df_combined = Historical_and_operational.join(Data_BE, how='inner') # List of model columns from historical_forecast model_columns = historical_forecast.columns # Initialize dictionaries to store MAE and RMSE results for each variable results_wind_onshore = {} results_wind_offshore = {} results_load = {} results_solar = {} # Mapping of variables to their corresponding naive models naive_models = { 'Wind_onshore': 'Wind_onshore_DailyNaiveSeasonal', 'Wind_offshore': 'Wind_offshore_DailyNaiveSeasonal', 'Load': 'Load_WeeklyNaiveSeasonal', 'Solar': 'Solar_DailyNaiveSeasonal' } # Step 1: Calculate MAE, RMSE, and rMAE for each model for col in model_columns: # Extract the variable name by taking everything before the first underscore base_variable = col.split('_')[0] # Handle cases where variable names might be combined with multiple parts (e.g., "Load_LightGBMModel...") if base_variable in ['Wind', 'Load', 'Solar']: if 'onshore' in col: variable_name = 'Wind_onshore' results_dict = results_wind_onshore elif 'offshore' in col: variable_name = 'Wind_offshore' results_dict = results_wind_offshore else: variable_name = base_variable results_dict = results_load if base_variable == 'Load' else results_solar else: variable_name = base_variable # Construct the corresponding `variable_entsoe` column name entsoe_column = f'{variable_name}_entsoe' naive_model_col = naive_models.get(variable_name, None) # Drop NaNs for the specific pair of columns before calculating MAE and RMSE if entsoe_column in df_combined.columns and naive_model_col in df_combined.columns: valid_data = df_combined[[col, entsoe_column]].dropna() valid_naive_data = df_combined[[entsoe_column, naive_model_col]].dropna() # Calculate MAE and RMSE for the model against the `variable_entsoe` mae = np.mean(abs(valid_data[col] - valid_data[entsoe_column])) rmse = np.sqrt(mean_squared_error(valid_data[col], valid_data[entsoe_column])) # Calculate MAE for the Naive model mae_naive = np.mean(abs(valid_naive_data[entsoe_column] - valid_naive_data[naive_model_col])) # Calculate rMAE for the model rMAE = mae / mae_naive if mae_naive != 0 else np.inf # Store the results in the corresponding dictionary results_dict[f'{col}'] = {'MAE': mae, 'RMSE': rmse, 'rMAE': rMAE} # Step 2: Calculate MAE, RMSE, and rMAE for ENTSO-E forecasts specifically for variable_name in naive_models.keys(): entsoe_column = f'{variable_name}_entsoe' forecast_entsoe_column = f'{variable_name}_forecast_entsoe' naive_model_col = naive_models[variable_name] # Ensure that the ENTSO-E forecast is included in the results if forecast_entsoe_column in df_combined.columns: valid_data = df_combined[[forecast_entsoe_column, entsoe_column]].dropna() valid_naive_data = df_combined[[entsoe_column, naive_model_col]].dropna() # Calculate MAE and RMSE for the ENTSO-E forecast against the actuals mae_entsoe = np.mean(abs(valid_data[forecast_entsoe_column] - valid_data[entsoe_column])) rmse_entsoe = np.sqrt(mean_squared_error(valid_data[forecast_entsoe_column], valid_data[entsoe_column])) # Calculate rMAE for the ENTSO-E forecast mae_naive = np.mean(abs(valid_naive_data[entsoe_column] - valid_naive_data[naive_model_col])) rMAE_entsoe = mae_entsoe / mae_naive if mae_naive != 0 else np.inf # Add the ENTSO-E results to the corresponding dictionary if variable_name == 'Wind_onshore': results_wind_onshore[forecast_entsoe_column] = {'MAE': mae_entsoe, 'RMSE': rmse_entsoe, 'rMAE': rMAE_entsoe} elif variable_name == 'Wind_offshore': results_wind_offshore[forecast_entsoe_column] = {'MAE': mae_entsoe, 'RMSE': rmse_entsoe, 'rMAE': rMAE_entsoe} elif variable_name == 'Load': results_load[forecast_entsoe_column] = {'MAE': mae_entsoe, 'RMSE': rmse_entsoe, 'rMAE': rMAE_entsoe} elif variable_name == 'Solar': results_solar[forecast_entsoe_column] = {'MAE': mae_entsoe, 'RMSE': rmse_entsoe, 'rMAE': rMAE_entsoe} # Convert the dictionaries to DataFrames and sort by rMAE df_wind_onshore = pd.DataFrame.from_dict(results_wind_onshore, orient='index').sort_values(by='rMAE') df_wind_offshore = pd.DataFrame.from_dict(results_wind_offshore, orient='index').sort_values(by='rMAE') df_load = pd.DataFrame.from_dict(results_load, orient='index').sort_values(by='rMAE') df_solar = pd.DataFrame.from_dict(results_solar, orient='index').sort_values(by='rMAE') st.write("##### Wind Onshore:") st.dataframe(df_wind_onshore) st.write("##### Wind Offshore:") st.dataframe(df_wind_offshore) st.write("##### Load:") st.dataframe(df_load) st.write("##### Solar:") st.dataframe(df_solar) else: accuracy_metrics = pd.DataFrame(columns=['MAE', 'rMAE'], index=['Load', 'Solar', 'Wind Onshore', 'Wind Offshore']) for i in range(0, len(forecast_columns), 2): actual_col = forecast_columns[i] forecast_col = forecast_columns[i + 1] if forecast_col in data.columns: obs = data[actual_col] pred = data[forecast_col] error = pred - obs mae = round(np.mean(np.abs(error)),2) if 'Load' in actual_col: persistence = obs.shift(168) # Weekly persistence else: persistence = obs.shift(24) # Daily persistence # Using the whole year's data for rMAE calculations rmae = round(mae / np.mean(np.abs(obs - persistence)),2) row_label = 'Load' if 'Load' in actual_col else 'Solar' if 'Solar' in actual_col else 'Wind Offshore' if 'Wind_offshore' in actual_col else 'Wind Onshore' accuracy_metrics.loc[row_label] = [mae, rmae] accuracy_metrics.dropna(how='all', inplace=True)# Sort by rMAE (second column) accuracy_metrics.sort_values(by=accuracy_metrics.columns[1], ascending=True, inplace=True) accuracy_metrics = accuracy_metrics.round(4) col1, col2 = st.columns([3, 2]) with col1: st.dataframe(accuracy_metrics) with col2: st.markdown("""