Spaces:
Running
Running
import requests | |
import pandas as pd | |
from io import StringIO | |
import streamlit as st | |
import os | |
import plotly.express as px | |
import plotly.graph_objects as go | |
import plotly.colors as pc | |
import numpy as np | |
from sklearn.metrics import mean_squared_error | |
from statsmodels.tsa.stattools import acf | |
from statsmodels.graphics.tsaplots import plot_acf | |
import matplotlib.pyplot as plt | |
from datetime import datetime | |
def get_current_time(): | |
now = datetime.now() | |
current_hour = now.hour | |
current_minute = now.minute | |
# Return the hour and a boolean indicating if it is after the 10th minute | |
return current_hour, current_minute >= 10 | |
##GET ALL FILES FROM GITHUB | |
def load_GitHub(github_token, file_name, hour, after_10_min): | |
url = f'https://raw.githubusercontent.com/margaridamascarenhas/Transparency_Data/main/{file_name}' | |
headers = {'Authorization': f'token {github_token}'} | |
response = requests.get(url, headers=headers) | |
if response.status_code == 200: | |
csv_content = StringIO(response.text) | |
df = pd.read_csv(csv_content) | |
if 'Date' in df.columns: | |
df['Date'] = pd.to_datetime(df['Date']) # Convert 'Date' column to datetime | |
df.set_index('Date', inplace=True) # Set 'Date' column as the index | |
#df.to_csv(file_name) | |
return df | |
else: | |
print(f"Failed to download {file_name}. Status code: {response.status_code}") | |
return None | |
def load_forecast(github_token, hour, after_10_min): | |
predictions_dict = {} | |
for hour in range(24): | |
file_name = f'Predictions_{hour}h.csv' | |
df = load_GitHub(github_token, file_name, hour, after_10_min) | |
if df is not None: | |
predictions_dict[file_name] = df | |
return predictions_dict | |
def convert_European_time(data, time_zone): | |
data.index = pd.to_datetime(data.index, utc=True) | |
data.index = data.index.tz_convert(time_zone) | |
data.index = data.index.tz_localize(None) | |
return data | |
def simplify_model_names(df): | |
# Define the mapping of complex names to simpler ones | |
replacements = { | |
r'\.LightGBMModel\.\dD\.TimeCov\.Temp\.Forecast_elia': '.LightGBM_with_Forecast_elia', | |
r'\.LightGBMModel\.\dD\.TimeCov\.Temp': '.LightGBM', | |
r'\.Naive\.\dD': '.Naive', | |
} | |
# Apply the replacements | |
for original, simplified in replacements.items(): | |
df.columns = df.columns.str.replace(original, simplified, regex=True) | |
return df | |
def simplify_model_names_in_index(df): | |
# Define the mapping of complex names to simpler ones | |
replacements = { | |
r'\.LightGBMModel\.\dD\.TimeCov\.Temp\.Forecast_elia': '.LightGBM_with_Forecast_elia', | |
r'\.LightGBMModel\.\dD\.TimeCov\.Temp': '.LightGBM', | |
r'\.Naive\.\dD': '.Naive', | |
} | |
# Apply the replacements to the DataFrame index | |
for original, simplified in replacements.items(): | |
df.index = df.index.str.replace(original, simplified, regex=True) | |
return df | |
github_token = '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' | |
if github_token: | |
hour, after_10_min=get_current_time() | |
forecast_dict = load_forecast(github_token, hour, after_10_min) | |
historical_forecast=load_GitHub(github_token, 'Historical_forecast.csv', hour, after_10_min) | |
Data_BE=load_GitHub(github_token, 'BE_Elia_Entsoe_UTC.csv', hour, after_10_min) | |
Data_FR=load_GitHub(github_token, 'FR_Entsoe_UTC.csv', hour, after_10_min) | |
Data_NL=load_GitHub(github_token, 'NL_Entsoe_UTC.csv', hour, after_10_min) | |
Data_DE=load_GitHub(github_token, 'DE_Entsoe_UTC.csv', hour, after_10_min) | |
Data_BE=convert_European_time(Data_BE, 'Europe/Brussels') | |
Data_FR=convert_European_time(Data_FR, 'Europe/Paris') | |
Data_NL=convert_European_time(Data_NL, 'Europe/Amsterdam') | |
Data_DE=convert_European_time(Data_DE, 'Europe/Berlin') | |
else: | |
print("Please enter your GitHub Personal Access Token to proceed.") | |
# Main layout of the app | |
col1, col2 = st.columns([5, 2]) # Adjust the ratio to better fit your layout needs | |
with col1: | |
st.title("Transparency++") | |
with col2: | |
upper_space = col2.empty() | |
upper_space = col2.empty() | |
col2_1, col2_2 = st.columns(2) # Create two columns within the right column for side-by-side images | |
with col2_1: | |
st.image("KU_Leuven_logo.png", width=100) # Adjust the path and width as needed | |
with col2_2: | |
st.image("energyville_logo.png", width=100) | |
upper_space.markdown(""" | |
| |
| |
""", unsafe_allow_html=True) | |
countries = { | |
'Overall': 'Overall', | |
'Netherlands': 'NL', | |
'Germany': 'DE', | |
'France': 'FR', | |
'Belgium': 'BE', | |
} | |
st.sidebar.header('Filters') | |
st.sidebar.subheader("Select Country") | |
st.sidebar.caption("Choose the country for which you want to display data or forecasts.") | |
selected_country = st.sidebar.selectbox('Select Country', list(countries.keys())) | |
# Ensure the date range provides two dates | |
# Sidebar with radio buttons for different sections | |
if selected_country != 'Overall': | |
st.sidebar.subheader("Section") | |
st.sidebar.caption("Select the type of information you want to explore.") | |
section = st.sidebar.radio('', ['Data Quality', 'Forecasts Quality', 'Insights'], index=1) | |
date_range = st.sidebar.date_input("Select Date Range for Metrics Calculation:", | |
value=(pd.to_datetime("2024-01-01"), pd.to_datetime(pd.Timestamp('today')))) | |
if len(date_range) == 2: | |
start_date = pd.Timestamp(date_range[0]) | |
end_date = pd.Timestamp(date_range[1]) | |
else: | |
st.error("Please select a valid date range.") | |
st.stop() | |
else: | |
section = None # No section is shown when "Overall" is selected | |
if selected_country == 'Overall': | |
data = None # You can set data to None or a specific dataset based on your logic | |
section = None # No section selected when "Overall" is chosen | |
else: | |
country_code = countries[selected_country] | |
if country_code == 'BE': | |
data = Data_BE | |
weather_columns = ['Temperature', 'Wind Speed Onshore', 'Wind Speed Offshore'] | |
data['Temperature'] = data['temperature_2m_8'] | |
data['Wind Speed Offshore'] = data['wind_speed_100m_4'] | |
data['Wind Speed Onshore'] = data['wind_speed_100m_8'] | |
elif country_code == 'DE': | |
data = Data_DE | |
weather_columns = ['Temperature', 'Wind Speed'] | |
data['Temperature'] = data['temperature_2m'] | |
data['Wind Speed'] = data['wind_speed_100m'] | |
elif country_code == 'NL': | |
data = Data_NL | |
weather_columns = ['Temperature', 'Wind Speed'] | |
data['Temperature'] = data['temperature_2m'] | |
data['Wind Speed'] = data['wind_speed_100m'] | |
elif country_code == 'FR': | |
data = Data_FR | |
weather_columns = ['Temperature', 'Wind Speed'] | |
data['Temperature'] = data['temperature_2m'] | |
data['Wind Speed'] = data['wind_speed_100m'] | |
def add_feature(df2, df_main): | |
#df_main.index = pd.to_datetime(df_main.index) | |
#df2.index = pd.to_datetime(df2.index) | |
df_combined = df_main.combine_first(df2) | |
last_date_df1 = df_main.index.max() | |
first_date_df2 = df2.index.min() | |
if first_date_df2 == last_date_df1 + pd.Timedelta(hours=1): | |
df_combined = pd.concat([df_main, df2[df2.index > last_date_df1]], axis=0) | |
#df_combined.reset_index(inplace=True) | |
return df_combined | |
#data.index = data.index.tz_localize('UTC') | |
forecast_columns = [ | |
'Load_entsoe','Load_forecast_entsoe','Wind_onshore_entsoe','Wind_onshore_forecast_entsoe','Wind_offshore_entsoe','Wind_offshore_forecast_entsoe','Solar_entsoe','Solar_forecast_entsoe'] | |
if section == 'Data Quality': | |
st.header('Data Quality') | |
st.write('The table below presents the data quality metrics for various energy-related datasets, focusing on the percentage of missing values and the occurrence of extreme or nonsensical values for the selected country.') | |
data_quality=data.iloc[:-28] | |
# Report % of missing values | |
missing_values = data_quality[forecast_columns].isna().mean() * 100 | |
missing_values = missing_values.round(2) | |
installed_capacities = { | |
'FR': { 'Solar': 17419, 'Wind Offshore': 1483, 'Wind Onshore': 22134}, | |
'DE': { 'Solar': 73821, 'Wind Offshore': 8386, 'Wind Onshore': 59915}, | |
'BE': { 'Solar': 8789, 'Wind Offshore': 2262, 'Wind Onshore': 3053}, | |
'NL': { 'Solar': 22590, 'Wind Offshore': 3220, 'Wind Onshore': 6190}, | |
} | |
if country_code not in installed_capacities: | |
st.error(f"Installed capacities not defined for country code '{country_code}'.") | |
st.stop() | |
# Report % of extreme, impossible values for the selected country | |
capacities = installed_capacities[country_code] | |
extreme_values = {} | |
for col in forecast_columns: | |
if 'Solar_entsoe' in col: | |
extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Solar'])).mean() * 100 | |
elif 'Solar_forecast_entsoe' in col: | |
extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Solar'])).mean() * 100 | |
elif 'Wind_onshore_entsoe' in col: | |
extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Onshore'])).mean() * 100 | |
elif 'Wind_onshore_forecast_entsoe' in col: | |
extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Onshore'])).mean() * 100 | |
elif 'Wind_offshore_entsoe' in col: | |
extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Offshore'])).mean() * 100 | |
elif 'Wind_offshore_forecast_entsoe' in col: | |
extreme_values[col] = ((data_quality[col] < 0) | (data_quality[col] > capacities['Wind Offshore'])).mean() * 100 | |
elif 'Load_entsoe' in col: | |
extreme_values[col] = ((data_quality[col] < 0)).mean() * 100 | |
elif 'Load_forecast_entsoe' in col: | |
extreme_values[col] = ((data_quality[col] < 0)).mean() * 100 | |
extreme_values = pd.Series(extreme_values).round(2) | |
# Combine all metrics into one DataFrame | |
metrics_df = pd.DataFrame({ | |
'Missing Values (%)': missing_values, | |
'Extreme/Nonsensical Values (%)': extreme_values, | |
}) | |
st.markdown( | |
""" | |
<style> | |
.dataframe {font-size: 45px !important;} | |
</style> | |
""", | |
unsafe_allow_html=True | |
) | |
st.dataframe(metrics_df) | |
st.write('<b><u>Missing values (%)</u></b>: Percentage of missing values in the dataset', unsafe_allow_html=True) | |
st.write('<b><u>Extreme/Nonsensical values (%)</u></b>: Values that are considered implausible such as negative or out-of-bound values i.e., (generation<0) or (generation>capacity)', unsafe_allow_html=True) | |
# Section 2: Forecasts | |
elif section == 'Forecasts Quality': | |
st.header('Forecast Quality') | |
# Time series for last 1 week | |
last_week = data.loc[data.index >= (data.index[-1] - pd.Timedelta(days=7))] | |
st.write('The below plots show the time series of forecasts vs. observations provided by the ENTSO-E Transparency platform from the past week.') | |
num_per_var=2 | |
forecast_columns_line=forecast_columns | |
for i in range(0, len(forecast_columns_line), num_per_var): | |
actual_col = forecast_columns_line[i] | |
forecast_col = forecast_columns_line[i + 1] | |
if forecast_col in data.columns: | |
fig = go.Figure() | |
fig.add_trace(go.Scatter(x=last_week.index, y=last_week[actual_col], mode='lines', name='Actual')) | |
fig.add_trace(go.Scatter(x=last_week.index, y=last_week[forecast_col], mode='lines', name='Forecast ENTSO-E')) | |
fig.update_layout(title=f'Forecasts vs Actual for {actual_col}', xaxis_title='Date', yaxis_title='Value [MW]') | |
st.plotly_chart(fig) | |
# Scatter plots for error distribution | |
st.subheader('Error Distribution') | |
st.write('The below scatter plots show the error distribution of all three fields: Solar, Wind and Load between the selected date range') | |
data_2024 = data[data.index.year > 2023] | |
for i in range(0, len(forecast_columns), 2): | |
actual_col = forecast_columns[i] | |
forecast_col = forecast_columns[i + 1] | |
if forecast_col in data_2024.columns: | |
obs = data_2024[actual_col] | |
pred = data_2024[forecast_col] | |
error = pred - obs | |
fig = px.scatter(x=obs, y=pred, labels={'x': 'Observed [MW]', 'y': 'Predicted by ENTSO-E [MW]'}) | |
fig.update_layout(title=f'Error Distribution for {forecast_col}') | |
st.plotly_chart(fig) | |
st.subheader('Accuracy Metrics (Sorted by rMAE):') | |
output_text = f"The below metrics are calculated from the selected date range from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}. This interval can be adjusted from the sidebar." | |
st.write(output_text) | |
data = data.loc[start_date:end_date] | |
accuracy_metrics = pd.DataFrame(columns=['MAE', 'rMAE'], index=['Load', 'Solar', 'Wind Onshore', 'Wind Offshore']) | |
for i in range(0, len(forecast_columns), 2): | |
actual_col = forecast_columns[i] | |
forecast_col = forecast_columns[i + 1] | |
if forecast_col in data.columns: | |
obs = data[actual_col] | |
pred = data[forecast_col] | |
error = pred - obs | |
mae = round(np.mean(np.abs(error)),2) | |
if 'Load' in actual_col: | |
persistence = obs.shift(168) # Weekly persistence | |
else: | |
persistence = obs.shift(24) # Daily persistence | |
# Using the whole year's data for rMAE calculations | |
rmae = round(mae / np.mean(np.abs(obs - persistence)),2) | |
row_label = 'Load' if 'Load' in actual_col else 'Solar' if 'Solar' in actual_col else 'Wind Offshore' if 'Wind_offshore' in actual_col else 'Wind Onshore' | |
accuracy_metrics.loc[row_label] = [mae, rmae] | |
accuracy_metrics.dropna(how='all', inplace=True)# Sort by rMAE (second column) | |
accuracy_metrics.sort_values(by=accuracy_metrics.columns[1], ascending=True, inplace=True) | |
accuracy_metrics = accuracy_metrics.round(4) | |
col1, col2 = st.columns([3, 2]) | |
with col1: | |
st.dataframe(accuracy_metrics) | |
with col2: | |
st.markdown(""" | |
<style> | |
.big-font { | |
font-size: 20px; | |
font-weight: 500; | |
} | |
</style> | |
<div class="big-font"> | |
Equations | |
</div> | |
""", unsafe_allow_html=True) | |
st.markdown(r""" | |
$\text{MAE} = \frac{1}{n}\sum_{i=1}^{n}|y_i - \hat{y}_i|$ | |
$\text{rMAE} = \frac{\text{MAE}}{MAE_{\text{Persistence Model}}}$ | |
""") | |
st.subheader('ACF plots of Errors') | |
st.write('The below plots show the ACF (Auto-Correlation Function) for the errors of all three data fields obtained from ENTSO-E: Solar, Wind and Load.') | |
for i in range(0, len(forecast_columns), 2): | |
actual_col = forecast_columns[i] | |
forecast_col = forecast_columns[i + 1] | |
if forecast_col in data.columns: | |
obs = data[actual_col] | |
pred = data[forecast_col] | |
error = pred - obs | |
st.write(f"**ACF of Errors for {actual_col}**") | |
fig, ax = plt.subplots(figsize=(10, 5)) | |
plot_acf(error.dropna(), ax=ax) | |
st.pyplot(fig) | |
acf_values = acf(error.dropna(), nlags=240) | |
# Section 3: Insights | |
elif section == 'Insights': | |
st.header("Insights") | |
st.write(""" | |
This section provides insights derived from the data and forecasts. | |
You can visualize trends, anomalies, and other important findings. | |
""") | |
# Scatter plots for correlation between wind, solar, and load | |
st.subheader('Correlation between Wind, Solar, Load and Weather Features') | |
st.write('The below scatter plots are made for checking whether there exists a correlation between the data fields obtained: Solar, Wind, Load and Weather Features.') | |
selected_columns=['Load_entsoe', 'Solar_entsoe', 'Wind_offshore_entsoe', 'Wind_onshore_entsoe'] + weather_columns | |
selected_df=data[selected_columns] | |
selected_df.columns = [col.replace('_entsoe', '').replace('_', ' ') for col in selected_df.columns] | |
selected_df = selected_df.dropna() | |
print(selected_df) | |
sns.set_theme(style="ticks") | |
pairplot_fig = sns.pairplot(selected_df) | |
# Display the pairplot in Streamlit | |
st.pyplot(pairplot_fig) | |
elif selected_country == 'Overall': | |
st.subheader("Net Load Error Map") | |
st.write(""" | |
The net load error map highlights the error in the forecasted versus actual net load for each country. | |
Hover over each country to see details on the latest net load error and the timestamp of the last recorded data. | |
""") | |
def plot_net_load_error_map(data_dict): | |
# Define forecast columns used in calculation | |
def calculate_net_load_error(df): | |
filter_df = df[forecast_columns].dropna() | |
net_load = filter_df['Load_entsoe'] - filter_df['Wind_onshore_entsoe'] - filter_df['Wind_offshore_entsoe'] - filter_df['Solar_entsoe'] | |
net_load_forecast = filter_df['Load_forecast_entsoe'] - filter_df['Wind_onshore_forecast_entsoe'] - filter_df['Wind_offshore_forecast_entsoe'] - filter_df['Solar_forecast_entsoe'] | |
error = (net_load - net_load_forecast).iloc[-1] | |
date = filter_df.index[-1].strftime("%Y-%m-%d %H:%M") # Get the latest date in string format | |
return error, date | |
# Calculate net load errors and dates for each country | |
net_load_errors = {country_name: calculate_net_load_error(data) for country_name, data in data_dict.items()} | |
# Create DataFrame for Folium with additional date column | |
df_net_load_error = pd.DataFrame({ | |
'country': list(net_load_errors.keys()), | |
'net_load_error': [v[0] for v in net_load_errors.values()], | |
'date': [v[1] for v in net_load_errors.values()] | |
}) | |
# Load the GeoJSON file | |
geojson_url = "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data/world-countries.json" | |
geo_data = requests.get(geojson_url).json() | |
# Filter GeoJSON to only include the selected countries | |
selected_countries = list(data_dict.keys()) # Get the list of selected countries (Belgium, France, Germany, Netherlands) | |
filtered_geojson = { | |
"type": "FeatureCollection", | |
"features": [feature for feature in geo_data["features"] if feature["properties"]["name"] in selected_countries] | |
} | |
# Merge the geojson with the error and date data | |
for feature in filtered_geojson["features"]: | |
country_name = feature["properties"]["name"] | |
row = df_net_load_error[df_net_load_error['country'] == country_name] | |
if not row.empty: | |
feature["properties"]["net_load_error"] = row.iloc[0]["net_load_error"] | |
feature["properties"]["date"] = row.iloc[0]["date"] | |
# Initialize the Folium map centered on Central Europe | |
m = folium.Map(location=[51, 10], zoom_start=5, tiles="cartodb positron") | |
# Add choropleth layer to map net load errors by country | |
folium.Choropleth( | |
geo_data=filtered_geojson, | |
name="choropleth", | |
data=df_net_load_error, | |
columns=["country", "net_load_error"], | |
key_on="feature.properties.name", | |
fill_color="RdYlBu", # Use a more vibrant color palette | |
fill_opacity=0.7, | |
line_opacity=0.5, | |
line_color="black", # Neutral border color | |
legend_name="Net Load Error" | |
).add_to(m) | |
# Add a GeoJson layer with custom tooltip for country, error, and date | |
folium.GeoJson( | |
filtered_geojson, | |
style_function=lambda x: {'fillOpacity': 0, 'color': 'black', 'weight': 0}, | |
tooltip=folium.GeoJsonTooltip( | |
fields=["name", "net_load_error", "date"], | |
aliases=["Country:", "Net Load Error:", "Date:"], | |
localize=True | |
) | |
).add_to(m) | |
# Display Folium map in Streamlit | |
st_folium(m, width=700, height=600) | |
# Data dictionary with full country names | |
data_dict = { | |
'Belgium': Data_BE, | |
'France': Data_FR, | |
'Germany': Data_DE, | |
'Netherlands': Data_NL | |
} | |
# Call the function to plot the map | |
plot_net_load_error_map(data_dict) | |
st.subheader("rMAE of Forecasts published on ENTSO-E TP") | |
st.write(""" | |
The radar chart below compares the forecast accuracy across Load, Onshore Wind, Offshore Wind, and Solar for each country. | |
""") | |
def calculate_mae(actual, forecast): | |
return np.mean(np.abs(actual - forecast)) | |
# Function to calculate persistence MAE | |
def calculate_persistence_mae(data, shift_hours): | |
return np.mean(np.abs(data - data.shift(shift_hours))) | |
# Function to calculate rMAE for each country | |
def calculate_rmae_for_country(df): | |
rmae = {} | |
rmae['Load'] = calculate_mae(df['Load_entsoe'], df['Load_forecast_entsoe']) / calculate_persistence_mae(df['Load_entsoe'], 168) | |
rmae['Wind_onshore'] = calculate_mae(df['Wind_onshore_entsoe'], df['Wind_onshore_forecast_entsoe']) / calculate_persistence_mae(df['Wind_onshore_entsoe'], 24) | |
rmae['Wind_offshore'] = calculate_mae(df['Wind_offshore_entsoe'], df['Wind_offshore_forecast_entsoe']) / calculate_persistence_mae(df['Wind_offshore_entsoe'], 24) | |
rmae['Solar'] = calculate_mae(df['Solar_entsoe'], df['Solar_forecast_entsoe']) / calculate_persistence_mae(df['Solar_entsoe'], 24) | |
return rmae | |
# Function to create rMAE DataFrame | |
def create_rmae_dataframe(data_dict): | |
rmae_values = {'Country': [], 'Load': [], 'Wind_onshore': [], 'Wind_offshore': [], 'Solar': []} | |
for country_name, df in data_dict.items(): | |
df_filtered = df[forecast_columns].dropna() | |
rmae = calculate_rmae_for_country(df_filtered) | |
rmae_values['Country'].append(country_name) | |
for key in rmae: | |
rmae_values[key].append(rmae[key]) | |
return pd.DataFrame(rmae_values) | |
# Function to plot radar chart | |
def plot_rmae_radar_chart(rmae_df): | |
fig = go.Figure() | |
angles = ['Load', 'Wind_onshore', 'Wind_offshore', 'Solar'] | |
for _, row in rmae_df.iterrows(): | |
fig.add_trace(go.Scatterpolar(r=[row[angle] for angle in angles], theta=angles, fill='toself', name=row['Country'])) | |
fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 2])), showlegend=True, title="rMAE Radar Chart by Country") | |
st.plotly_chart(fig) | |
# Main execution to create and display radar plot | |
rmae_df = create_rmae_dataframe(data_dict) | |
plot_rmae_radar_chart(rmae_df) | |