import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import streamlit as st
from streamlit_option_menu import option_menu
from PIL import Image
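# Streamlit front end for the ITACA Insurance Core AI module: clustering
# analysis and anomaly detection over CSV claim data via pycaret.
# Launch locally with `streamlit run <this_file>.py`.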

def main():
    # Hide Streamlit's default menu and footer
    hide_streamlit_style = """
        <style>
        #MainMenu {visibility: hidden;}
        footer {visibility: hidden;}
        </style>
        """
    st.markdown(hide_streamlit_style, unsafe_allow_html=True)

    with st.sidebar:
        image = Image.open('itaca_logo.png')
        st.image(image, width=150)

        page = option_menu(
            menu_title='Menu',
            menu_icon="robot",
            options=["Clustering Analysis", "Anomaly Detection"],
            icons=["chat-dots", "key"],
            default_index=0,
        )
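        # option_menu returns the label of the selected entry, which drives
        # the page routing further below.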

        # Settings section below the option menu
        st.header("Settings")
        num_lines = st.text_input("% of lines to be processed:", value="100")
        graph_select = st.checkbox("Show Graphics", value=True)
        feat_imp_select = st.checkbox("Feature Importance", value=False)
        selected_clusters = st.slider("Choose a number of clusters", min_value=2, max_value=10, value=4)
        p_remove_multicollinearity = st.checkbox("Remove Multicollinearity", value=False)
        p_multicollinearity_threshold = st.slider("Choose a multicollinearity threshold", min_value=0.0, max_value=1.0, value=0.9)
        # p_remove_outliers = st.checkbox("Remove Outliers", value=False)
        # p_outliers_method = st.selectbox("Choose an Outlier Method", ["iforest", "ee", "lof"])
        p_transformation = st.checkbox("Choose Power Transform", value=False)
        p_normalize = st.checkbox("Choose Normalize", value=False)
        p_pca = st.checkbox("Choose PCA", value=False)
        p_pca_method = st.selectbox("Choose a PCA Method", ["linear", "kernel", "incremental"])
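        # These preprocessing toggles are passed straight through to pycaret's
        # setup() call on both pages.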

    st.title('ITACA Insurance Core AI Module')

    if page == "Clustering Analysis":
        st.header('Clustering Analysis')

        # Import the pycaret clustering API
        from pycaret.clustering import setup, create_model, assign_model, pull, plot_model
        from pycaret.clustering import ClusteringExperiment

        # Display the list of CSV files in the working directory
        directory = "./"
        all_files = os.listdir(directory)
        csv_files = [file for file in all_files if file.endswith(".csv")]
        # Select a CSV file from the list
        selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
        # Or upload a CSV file
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

        # Define the unsupervised model
        clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
        selected_model = st.selectbox("Choose a clustering model", clusteringmodel)

        # Read the selected or uploaded CSV file
        if selected_csv != "None" or uploaded_file is not None:
            if uploaded_file:
                try:
                    insurance_claims = pd.read_csv(uploaded_file, sep=',')
                except ValueError:
                    # Rewind the upload buffer before retrying with another delimiter
                    uploaded_file.seek(0)
                    insurance_claims = pd.read_csv(uploaded_file, sep='|', encoding='latin-1')
            else:
                insurance_claims = pd.read_csv(selected_csv)
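            # An alternative would be to let pandas sniff the delimiter:
            # pd.read_csv(uploaded_file, sep=None, engine='python')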
            num_rows = int(insurance_claims.shape[0] * int(num_lines) / 100)
            insurance_claims_reduced = insurance_claims.head(num_rows)
            st.write("Rows to be processed: " + str(num_rows))

            all_columns = insurance_claims_reduced.columns.tolist()
            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
            insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()

            st.header("Data Description")
            # Streamlit "magic": the bare expression below is rendered as a table
            insurance_claims_reduced.describe().T
            cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
            num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns

            # Calculate the correlation matrix of the numeric columns
            corr_matrix = insurance_claims_reduced[num_col].corr()
            # Create a Matplotlib figure and draw the heatmap with seaborn
            st.header("Heat Map")
            fig, ax = plt.subplots(figsize=(12, 8))
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
            ax.set_title('Correlation Heatmap')
            # Display the heatmap in Streamlit
            st.pyplot(fig)
            if st.button("Prediction"):
                s = setup(insurance_claims_reduced, session_id=123,
                          remove_multicollinearity=p_remove_multicollinearity,
                          multicollinearity_threshold=p_multicollinearity_threshold,
                          # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
                          transformation=p_transformation,
                          normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)

                exp_clustering = ClusteringExperiment()
                # init setup on the experiment object
                exp_clustering.setup(insurance_claims_reduced, session_id=123)
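                # Note: ClusteringExperiment is pycaret's OOP counterpart of the
                # functional setup() above; this second setup does not repeat the
                # preprocessing flags, so the experiment works on the raw selection.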
                with st.spinner("Analyzing..."):
                    # Train the selected clustering model
                    cluster_model = create_model(selected_model, num_clusters=selected_clusters)
                    cluster_model_2 = assign_model(cluster_model)

                    # Summary statistics for each cluster, restricted to numeric
                    # columns so the aggregations do not fail on object dtypes
                    numeric_cols = cluster_model_2.select_dtypes(include='number').columns
                    cluster_summary = cluster_model_2.groupby('Cluster')[numeric_cols].agg(
                        ['count', 'mean', 'median', 'min', 'max', 'std', 'var', 'sum',
                         ('quantile_25', lambda x: x.quantile(0.25)),
                         ('quantile_75', lambda x: x.quantile(0.75)),
                         'skew'])
                    st.header("Cluster Summary")
                    cluster_summary

                    st.header("Assign Model")
                    cluster_model_2

                    st.header("Clustering Metrics")
                    cluster_results = pull()
                    cluster_results
                    if graph_select:
                        st.header("Clustering Plots")
                        # The PCA cluster plot is available for every model; the
                        # others are skipped where the estimator cannot support
                        # them (e.g. elbow/silhouette need a tunable k, and the
                        # distance plot needs explicit cluster centers)
                        plot_model(cluster_model, plot='cluster', display_format='streamlit')
                        if selected_model != 'ap':
                            plot_model(cluster_model, plot='tsne', display_format='streamlit')
                        if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
                            plot_model(cluster_model, plot='elbow', display_format='streamlit')
                        if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
                            plot_model(cluster_model, plot='silhouette', display_format='streamlit')
                        if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
                            plot_model(cluster_model, plot='distance', display_format='streamlit')
                        if selected_model != 'ap':
                            plot_model(cluster_model, plot='distribution', display_format='streamlit')
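                    # Feature importance proxy: fit a logistic regression on the
                    # cluster labels and rank features by |coefficient|. This is
                    # a heuristic, not a property of the clustering model itself.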
                    if feat_imp_select:
                        st.header("Feature Importance")
                        from pycaret.classification import setup, create_model, get_config
                        s = setup(cluster_model_2, target='Cluster')
                        lr = create_model('lr')
                        # Rank features by absolute coefficient; note that for a
                        # multiclass target, coef_[0] reflects the first class only
                        feat_imp = pd.DataFrame({
                            'Feature': get_config('X_train').columns,
                            'Value': abs(lr.coef_[0]),
                        }).sort_values(by='Value', ascending=False).head(10)
                        # Display the top 10 features as a bar chart
                        st.bar_chart(feat_imp.set_index('Feature'))

    elif page == "Anomaly Detection":
        st.header('Anomaly Detection')

        # Import the pycaret anomaly-detection API
        from pycaret.anomaly import setup, create_model, assign_model, pull, plot_model
        from pycaret.anomaly import AnomalyExperiment

        # Display the list of CSV files in the working directory
        directory = "./"
        all_files = os.listdir(directory)
        csv_files = [file for file in all_files if file.endswith(".csv")]
        # Select a CSV file from the list
        selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
        # Or upload a CSV file
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

        # Define the unsupervised model
        anomalymodel = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
        selected_model = st.selectbox("Choose an anomaly model", anomalymodel)

        # Read the selected or uploaded CSV file
        if selected_csv != "None" or uploaded_file is not None:
            if uploaded_file:
                try:
                    insurance_claims = pd.read_csv(uploaded_file, sep=',')
                except ValueError:
                    # Rewind the upload buffer before retrying with another delimiter
                    uploaded_file.seek(0)
                    insurance_claims = pd.read_csv(uploaded_file, sep='|', encoding='latin-1')
            else:
                insurance_claims = pd.read_csv(selected_csv)
            num_rows = int(insurance_claims.shape[0] * int(num_lines) / 100)
            insurance_claims_reduced = insurance_claims.head(num_rows)
            st.write("Rows to be processed: " + str(num_rows))

            all_columns = insurance_claims_reduced.columns.tolist()
            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
            insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
            if st.button("Prediction"):
                s = setup(insurance_claims_reduced, session_id=123,
                          remove_multicollinearity=p_remove_multicollinearity,
                          multicollinearity_threshold=p_multicollinearity_threshold,
                          # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
                          transformation=p_transformation,
                          normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)

                exp_anomaly = AnomalyExperiment()
                # init setup on the experiment object
                exp_anomaly.setup(insurance_claims_reduced, session_id=123)
                with st.spinner("Analyzing..."):
                    # Train the selected anomaly-detection model
                    anomaly_model = create_model(selected_model)

                    st.header("Assign Model")
                    anomaly_model_2 = assign_model(anomaly_model)
                    anomaly_model_2

                    st.header("Anomaly Metrics")
                    anomaly_results = pull()
                    anomaly_results
                    if graph_select:
                        st.header("Anomaly Plots")
                        plot_model(anomaly_model, plot='tsne', display_format='streamlit')
                        plot_model(anomaly_model, plot='umap', display_format='streamlit')
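                    # Feature importance proxy: fit a logistic regression on the
                    # Anomaly flag and rank features by |coefficient|, mirroring
                    # the clustering page.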
                    if feat_imp_select:
                        st.header("Feature Importance")
                        from pycaret.classification import setup, create_model, get_config
                        # Drop the raw anomaly score to avoid target leakage
                        s = setup(anomaly_model_2.drop(columns=['Anomaly_Score'], errors='ignore'),
                                  target='Anomaly')
                        lr = create_model('lr')
                        # Rank features by absolute coefficient (binary target, so
                        # coef_[0] covers the single decision boundary)
                        feat_imp = pd.DataFrame({
                            'Feature': get_config('X_train').columns,
                            'Value': abs(lr.coef_[0]),
                        }).sort_values(by='Value', ascending=False).head(10)
                        # Display the top 10 features as a bar chart
                        st.bar_chart(feat_imp.set_index('Feature'))

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        st.sidebar.error(f"An error occurred: {e}")