import requests import pandas as pd from io import StringIO import streamlit as st import os import plotly.express as px import plotly.graph_objects as go import plotly.colors as pc import numpy as np from sklearn.metrics import mean_squared_error from statsmodels.tsa.stattools import acf from statsmodels.graphics.tsaplots import plot_acf import matplotlib.pyplot as plt from datetime import datetime def get_current_time(): now = datetime.now() current_hour = now.hour current_minute = now.minute # Return the hour and a boolean indicating if it is after the 10th minute return current_hour, current_minute >= 10 ##GET ALL FILES FROM GITHUB @st.cache_data(show_spinner=False) def load_GitHub(github_token, file_name, hour, after_10_min): url = f'https://raw.githubusercontent.com/margaridamascarenhas/Transparency_Data/main/{file_name}' headers = {'Authorization': f'token {github_token}'} response = requests.get(url, headers=headers) if response.status_code == 200: csv_content = StringIO(response.text) df = pd.read_csv(csv_content) if 'Date' in df.columns: df['Date'] = pd.to_datetime(df['Date']) # Convert 'Date' column to datetime df.set_index('Date', inplace=True) # Set 'Date' column as the index #df.to_csv(file_name) return df else: print(f"Failed to download {file_name}. Status code: {response.status_code}") return None @st.cache_data(show_spinner=False) def load_forecast(github_token, hour, after_10_min): predictions_dict = {} for hour in range(24): file_name = f'Predictions_{hour}h.csv' df = load_GitHub(github_token, file_name, hour, after_10_min) if df is not None: predictions_dict[file_name] = df return predictions_dict def convert_European_time(data, time_zone): data.index = pd.to_datetime(data.index, utc=True) data.index = data.index.tz_convert(time_zone) data.index = data.index.tz_localize(None) return data def simplify_model_names(df): # Define the mapping of complex names to simpler ones replacements = { r'\.LightGBMModel\.\dD\.TimeCov\.Temp\.Forecast_elia': '.LightGBM_with_Forecast_elia', r'\.LightGBMModel\.\dD\.TimeCov\.Temp': '.LightGBM', r'\.Naive\.\dD': '.Naive', } # Apply the replacements for original, simplified in replacements.items(): df.columns = df.columns.str.replace(original, simplified, regex=True) return df def simplify_model_names_in_index(df): # Define the mapping of complex names to simpler ones replacements = { r'\.LightGBMModel\.\dD\.TimeCov\.Temp\.Forecast_elia': '.LightGBM_with_Forecast_elia', r'\.LightGBMModel\.\dD\.TimeCov\.Temp': '.LightGBM', r'\.Naive\.\dD': '.Naive', } # Apply the replacements to the DataFrame index for original, simplified in replacements.items(): df.index = df.index.str.replace(original, simplified, regex=True) return df github_token = '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' if github_token: hour, after_10_min=get_current_time() forecast_dict = load_forecast(github_token, hour, after_10_min) historical_forecast=load_GitHub(github_token, 'Historical_forecast.csv', hour, after_10_min) Data_BE=load_GitHub(github_token, 'BE_Elia_Entsoe_UTC.csv', hour, after_10_min) Data_FR=load_GitHub(github_token, 'FR_Entsoe_UTC.csv', hour, after_10_min) Data_NL=load_GitHub(github_token, 'NL_Entsoe_UTC.csv', hour, after_10_min) Data_DE=load_GitHub(github_token, 'DE_Entsoe_UTC.csv', hour, after_10_min) Data_BE=convert_European_time(Data_BE, 'Europe/Brussels') Data_FR=convert_European_time(Data_FR, 'Europe/Paris') Data_NL=convert_European_time(Data_NL, 'Europe/Amsterdam') Data_DE=convert_European_time(Data_DE, 'Europe/Berlin') else: print("Please enter your GitHub Personal Access Token to proceed.") # Main layout of the app col1, col2 = st.columns([5, 2]) # Adjust the ratio to better fit your layout needs with col1: st.title("Transparency++") with col2: upper_space = col2.empty() upper_space = col2.empty() col2_1, col2_2 = st.columns(2) # Create two columns within the right column for side-by-side images with col2_1: st.image("KU_Leuven_logo.png", width=100) # Adjust the path and width as needed with col2_2: st.image("energyville_logo.png", width=100) upper_space.markdown(""" """, unsafe_allow_html=True) countries = { 'Overall': 'Overall', 'Netherlands': 'NL', 'Germany': 'DE', 'France': 'FR', 'Belgium': 'BE', } st.sidebar.header('Filters') st.sidebar.subheader("Select Country") st.sidebar.caption("Choose the country for which you want to display data or forecasts.") selected_country = st.sidebar.selectbox('Select Country', list(countries.keys())) # Ensure the date range provides two dates # Sidebar with radio buttons for different sections if selected_country != 'Overall': st.sidebar.subheader("Section") st.sidebar.caption("Select the type of information you want to explore.") section = st.sidebar.radio('', ['Data Quality', 'Forecasts Quality', 'Insights'], index=1) date_range = st.sidebar.date_input("Select Date Range for Metrics Calculation:", value=(pd.to_datetime("2024-01-01"), pd.to_datetime(pd.Timestamp('today')))) if len(date_range) == 2: start_date = pd.Timestamp(date_range[0]) end_date = pd.Timestamp(date_range[1]) else: st.error("Please select a valid date range.") st.stop() else: section = None # No section is shown when "Overall" is selected if selected_country == 'Overall': data = None # You can set data to None or a specific dataset based on your logic section = None # No section selected when "Overall" is chosen else: country_code = countries[selected_country] if country_code == 'BE': data = Data_BE weather_columns = ['Temperature', 'Wind Speed Onshore', 'Wind Speed Offshore'] data['Temperature'] = data['temperature_2m_8'] data['Wind Speed Offshore'] = data['wind_speed_100m_4'] data['Wind Speed Onshore'] = data['wind_speed_100m_8'] elif country_code == 'DE': data = Data_DE weather_columns = ['Temperature', 'Wind Speed'] data['Temperature'] = data['temperature_2m'] data['Wind Speed'] = data['wind_speed_100m'] elif country_code == 'NL': data = Data_NL weather_columns = ['Temperature', 'Wind Speed'] data['Temperature'] = data['temperature_2m'] data['Wind Speed'] = data['wind_speed_100m'] elif country_code == 'FR': data = Data_FR weather_columns = ['Temperature', 'Wind Speed'] data['Temperature'] = data['temperature_2m'] data['Wind Speed'] = data['wind_speed_100m'] def add_feature(df2, df_main): #df_main.index = pd.to_datetime(df_main.index) #df2.index = pd.to_datetime(df2.index) df_combined = df_main.combine_first(df2) last_date_df1 = df_main.index.max() first_date_df2 = df2.index.min() if first_date_df2 == last_date_df1 + pd.Timedelta(hours=1): df_combined = pd.concat([df_main, df2[df2.index > last_date_df1]], axis=0) #df_combined.reset_index(inplace=True) return df_combined #data.index = data.index.tz_localize('UTC') forecast_columns = [ 'Load_entsoe','Load_forecast_entsoe','Wind_onshore_entsoe','Wind_onshore_forecast_entsoe','Wind_offshore_entsoe','Wind_offshore_forecast_entsoe','Solar_entsoe','Solar_forecast_entsoe'] if section == 'Data Quality': st.header('Data Quality') st.write('The table below presents the data quality metrics for various energy-related datasets, focusing on the percentage of missing values and the occurrence of extreme or nonsensical values for the selected country.') data_quality=data.iloc[:-28] # Report % of missing values missing_values = data_quality[forecast_columns].isna().mean() * 100 missing_values = missing_values.round(2) installed_capacities = { 'FR': { 'Solar': 17419, 'Wind Offshore': 1483, 'Wind Onshore': 22134}, 'DE': { 'Solar': 73821, 'Wind Offshore': 8386, 'Wind Onshore': 59915}, 'BE': { 'Solar': 8789, 'Wind Offshore': 2262, 'Wind Onshore': 3053}, 'NL': { 'Solar': 22590, 'Wind Offshore': 3220, 'Wind Onshore': 6190}, } if country_code not in installed_capacities: st.error(f"Installed capacities not defined for country code '{country_code}'.") st.stop() # Report % of extreme, impossible values for the selected country capacities = installed_capacities[country_code] extreme_values = {} for col in forecast_columns: if 'Solar_entsoe' in col: extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Solar'])).mean() * 100 elif 'Solar_forecast_entsoe' in col: extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Solar'])).mean() * 100 elif 'Wind_onshore_entsoe' in col: extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Onshore'])).mean() * 100 elif 'Wind_onshore_forecast_entsoe' in col: extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Onshore'])).mean() * 100 elif 'Wind_offshore_entsoe' in col: extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Offshore'])).mean() * 100 elif 'Wind_offshore_forecast_entsoe' in col: extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Offshore'])).mean() * 100 elif 'Load_entsoe' in col: extreme_values[col] = ((data_quality[col] < 0)).mean() * 100 elif 'Load_forecast_entsoe' in col: extreme_values[col] = ((data_quality[col] < 0)).mean() * 100 extreme_values = pd.Series(extreme_values).round(2) # Combine all metrics into one DataFrame metrics_df = pd.DataFrame({ 'Missing Values (%)': missing_values, 'Extreme/Nonsensical Values (%)': extreme_values, }) st.markdown( """ """, unsafe_allow_html=True ) st.dataframe(metrics_df) st.write('Missing values (%): Percentage of missing values in the dataset', unsafe_allow_html=True) st.write('Extreme/Nonsensical values (%): Values that are considered implausible such as negative or out-of-bound values i.e., (generation<0) or (generation>capacity)', unsafe_allow_html=True) # Section 2: Forecasts elif section == 'Forecasts Quality': st.header('Forecast Quality') # Time series for last 1 week last_week = data.loc[data.index >= (data.index[-1] - pd.Timedelta(days=7))] st.write('The below plots show the time series of forecasts vs. observations provided by the ENTSO-E Transparency platform from the past week.') num_per_var=2 forecast_columns_line=forecast_columns for i in range(0, len(forecast_columns_line), num_per_var): actual_col = forecast_columns_line[i] forecast_col = forecast_columns_line[i + 1] if forecast_col in data.columns: fig = go.Figure() fig.add_trace(go.Scatter(x=last_week.index, y=last_week[actual_col], mode='lines', name='Actual')) fig.add_trace(go.Scatter(x=last_week.index, y=last_week[forecast_col], mode='lines', name='Forecast ENTSO-E')) fig.update_layout(title=f'Forecasts vs Actual for {actual_col}', xaxis_title='Date', yaxis_title='Value [MW]') st.plotly_chart(fig) # Scatter plots for error distribution st.subheader('Error Distribution') st.write('The below scatter plots show the error distribution of all three fields: Solar, Wind and Load between the selected date range') data_2024 = data[data.index.year > 2023] for i in range(0, len(forecast_columns), 2): actual_col = forecast_columns[i] forecast_col = forecast_columns[i + 1] if forecast_col in data_2024.columns: obs = data_2024[actual_col] pred = data_2024[forecast_col] error = pred - obs fig = px.scatter(x=obs, y=pred, labels={'x': 'Observed [MW]', 'y': 'Predicted by ENTSO-E [MW]'}) fig.update_layout(title=f'Error Distribution for {forecast_col}') st.plotly_chart(fig) st.subheader('Accuracy Metrics (Sorted by rMAE):') output_text = f"The below metrics are calculated from the selected date range from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}. This interval can be adjusted from the sidebar." st.write(output_text) data = data.loc[start_date:end_date] accuracy_metrics = pd.DataFrame(columns=['MAE', 'rMAE'], index=['Load', 'Solar', 'Wind Onshore', 'Wind Offshore']) for i in range(0, len(forecast_columns), 2): actual_col = forecast_columns[i] forecast_col = forecast_columns[i + 1] if forecast_col in data.columns: obs = data[actual_col] pred = data[forecast_col] error = pred - obs mae = round(np.mean(np.abs(error)),2) if 'Load' in actual_col: persistence = obs.shift(168) # Weekly persistence else: persistence = obs.shift(24) # Daily persistence # Using the whole year's data for rMAE calculations rmae = round(mae / np.mean(np.abs(obs - persistence)),2) row_label = 'Load' if 'Load' in actual_col else 'Solar' if 'Solar' in actual_col else 'Wind Offshore' if 'Wind_offshore' in actual_col else 'Wind Onshore' accuracy_metrics.loc[row_label] = [mae, rmae] accuracy_metrics.dropna(how='all', inplace=True)# Sort by rMAE (second column) accuracy_metrics.sort_values(by=accuracy_metrics.columns[1], ascending=True, inplace=True) accuracy_metrics = accuracy_metrics.round(4) col1, col2 = st.columns([3, 2]) with col1: st.dataframe(accuracy_metrics) with col2: st.markdown("""