import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st
from PIL import Image
from streamlit_option_menu import option_menu


def main():
    # The CSS payload was left empty in the source; the markdown call is harmless.
    hide_streamlit_style = """ """
    st.markdown(hide_streamlit_style, unsafe_allow_html=True)

    with st.sidebar:
        image = Image.open('itaca_logo.png')
        st.image(image, width=150)  # alternatively: use_column_width=True

        page = option_menu(
            menu_title='Menu',
            menu_icon="robot",
            options=["Clustering Analysis", "Anomaly Detection"],
            icons=["chat-dots", "key"],
            default_index=0,
        )

        # Additional section below the option menu
        # st.markdown("---")  # separator line
        st.header("Settings")

        num_lines = st.text_input("% of lines to be processed:", value="100")
        graph_select = st.checkbox("Show Graphics", value=True)
        feat_imp_select = st.checkbox("Feature Importance", value=False)

        selected_clusters = st.slider("Choose a number of clusters", min_value=2, max_value=10, value=4)

        p_remove_multicollinearity = st.checkbox("Remove Multicollinearity", value=False)
        p_multicollinearity_threshold = st.slider("Choose multicollinearity threshold", min_value=0.0, max_value=1.0, value=0.9)
        # p_remove_outliers = st.checkbox("Remove Outliers", value=False)
        # p_outliers_method = st.selectbox("Choose an Outlier Method", ["iforest", "ee", "lof"])
        p_transformation = st.checkbox("Choose Power Transform", value=False)
        p_normalize = st.checkbox("Choose Normalize", value=False)
        p_pca = st.checkbox("Choose PCA", value=False)
        p_pca_method = st.selectbox("Choose a PCA Method", ["linear", "kernel", "incremental"])

    st.title('ITACA Insurance Core AI Module')

    if page == "Clustering Analysis":
        st.header('Clustering Analysis')
        st.write(""" """)

        # PyCaret unsupervised (clustering) API
        from pycaret.clustering import setup, create_model, assign_model, pull, plot_model
        from pycaret.clustering import ClusteringExperiment

        # List the CSV files in the working directory
        directory = "./"
        all_files = os.listdir(directory)
        csv_files = [file for file in all_files if file.endswith(".csv")]

        # Select a CSV file from the list, or upload one
        selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

        # Available clustering models
        clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
        selected_model = st.selectbox("Choose a clustering model", clusteringmodel)

        # Read and display the CSV file
        if selected_csv != "None" or uploaded_file is not None:
            if uploaded_file:
                try:
                    insurance_claims = pd.read_csv(uploaded_file, sep=',')
                except ValueError:
                    uploaded_file.seek(0)  # rewind the buffer before retrying
                    insurance_claims = pd.read_csv(uploaded_file, sep='|', encoding='latin-1')
            else:
                insurance_claims = pd.read_csv(selected_csv)

            num_rows = int(insurance_claims.shape[0] * int(num_lines) / 100)
            insurance_claims_reduced = insurance_claims.head(num_rows)
            st.write("Rows to be processed: " + str(num_rows))

            all_columns = insurance_claims_reduced.columns.tolist()
            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)

            st.header("Data Description")
            st.write(insurance_claims_reduced.describe().T)

            cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
            num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns
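            # Optional sketch (not part of the original flow): per-column histograms
            # of the numeric features, rendered through Streamlit. Uncomment to
            # enable; assumes num_col is non-empty.
            # insurance_claims_reduced[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
            # st.pyplot(plt.gcf())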
            # Correlation matrix of the numeric columns
            corr_matrix = insurance_claims_reduced[num_col].corr()

            # Heatmap of the correlations
            st.header("Heat Map")
            fig, ax = plt.subplots(figsize=(12, 8))
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
            ax.set_title('Correlation Heatmap')
            st.pyplot(fig)

            if st.button("Prediction"):
                insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()

                s = setup(insurance_claims_reduced, session_id=123,
                          remove_multicollinearity=p_remove_multicollinearity,
                          multicollinearity_threshold=p_multicollinearity_threshold,
                          # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
                          transformation=p_transformation,
                          normalize=p_normalize,
                          pca=p_pca,
                          pca_method=p_pca_method)

                exp_clustering = ClusteringExperiment()
                # init setup on the experiment object as well
                exp_clustering.setup(insurance_claims_reduced, session_id=123)

                with st.spinner("Analyzing..."):
                    # train the selected clustering model
                    cluster_model = create_model(selected_model, num_clusters=selected_clusters)
                    cluster_model_2 = assign_model(cluster_model)

                    # Summary statistics per cluster, numeric columns only
                    # (aggregations such as 'mean' fail on object dtypes)
                    def quantile_25(x):
                        return x.quantile(0.25)

                    def quantile_75(x):
                        return x.quantile(0.75)

                    numeric_cols = cluster_model_2.select_dtypes(exclude=['object']).columns
                    cluster_summary = cluster_model_2.groupby('Cluster')[list(numeric_cols)].agg(
                        ['count', 'mean', 'median', 'min', 'max', 'std', 'var', 'sum',
                         quantile_25, quantile_75, 'skew'])

                    st.header("Cluster Summary")
                    st.write(cluster_summary)

                    st.header("Assign Model")
                    st.write(cluster_model_2)

                    # all_metrics = get_metrics()

                    st.header("Clustering Metrics")
                    cluster_results = pull()
                    st.write(cluster_results)

                    if graph_select:
                        st.header("Clustering Plots")

                        # PCA cluster plot
                        plot_model(cluster_model, plot='cluster', display_format='streamlit')

                        if selected_model != 'ap':
                            plot_model(cluster_model, plot='tsne', display_format='streamlit')
                        if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
                            plot_model(cluster_model, plot='elbow', display_format='streamlit')
                        if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
                            plot_model(cluster_model, plot='silhouette', display_format='streamlit')
                        if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
                            plot_model(cluster_model, plot='distance', display_format='streamlit')
                        if selected_model != 'ap':
                            plot_model(cluster_model, plot='distribution', display_format='streamlit')

                    # Fit a classification model on the cluster labels to extract
                    # feature importance
                    if feat_imp_select:
                        st.header("Feature Importance")
                        from pycaret.classification import setup as clf_setup, create_model as clf_create_model, get_config
                        clf_setup(cluster_model_2, target='Cluster')
                        lr = clf_create_model('lr')
                        # absolute logistic-regression coefficients as importances
                        # (with more than two classes this uses the first class only)
                        feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns,
                                                 'Value': abs(lr.coef_[0])})
                        # sort by importance and keep the top 10
                        feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
                        # st.dataframe(feat_imp)
                        st.bar_chart(feat_imp.set_index('Feature'))

    elif page == "Anomaly Detection":
        st.header('Anomaly Detection')
        st.write(""" """)

        # PyCaret anomaly-detection API
        from pycaret.anomaly import setup, create_model, assign_model, pull, plot_model
        from pycaret.anomaly import AnomalyExperiment
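        # The CSV-selection/upload flow below mirrors the clustering page almost
        # line for line; the optional helper sketched at the end of this file
        # shows one way the duplication could be factored out.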
        # List the CSV files in the working directory
        directory = "./"
        all_files = os.listdir(directory)
        csv_files = [file for file in all_files if file.endswith(".csv")]

        # Select a CSV file from the list, or upload one
        selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

        # Available anomaly-detection models
        anomalymodel = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
        selected_model = st.selectbox("Choose an anomaly model", anomalymodel)

        # Read and display the CSV file
        if selected_csv != "None" or uploaded_file is not None:
            if uploaded_file:
                try:
                    insurance_claims = pd.read_csv(uploaded_file, sep=',')
                except ValueError:
                    uploaded_file.seek(0)  # rewind the buffer before retrying
                    insurance_claims = pd.read_csv(uploaded_file, sep='|', encoding='latin-1')
            else:
                insurance_claims = pd.read_csv(selected_csv)

            num_rows = int(insurance_claims.shape[0] * int(num_lines) / 100)
            insurance_claims = insurance_claims.head(num_rows)
            st.write("Rows to be processed: " + str(num_rows))

            all_columns = insurance_claims.columns.tolist()
            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)

            if st.button("Prediction"):
                insurance_claims = insurance_claims[selected_columns].copy()

                s = setup(insurance_claims, session_id=123,
                          remove_multicollinearity=p_remove_multicollinearity,
                          multicollinearity_threshold=p_multicollinearity_threshold,
                          # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
                          transformation=p_transformation,
                          normalize=p_normalize,
                          pca=p_pca,
                          pca_method=p_pca_method)

                exp_anomaly = AnomalyExperiment()
                # init setup on the experiment object as well
                exp_anomaly.setup(insurance_claims, session_id=123)

                with st.spinner("Analyzing..."):
                    # train the selected anomaly model
                    anomaly_model = create_model(selected_model)

                    st.header("Assign Model")
                    anomaly_model_2 = assign_model(anomaly_model)
                    st.write(anomaly_model_2)

                    st.header("Anomaly Metrics")
                    anomaly_results = pull()
                    st.write(anomaly_results)

                    if graph_select:
                        st.header("Anomaly Plots")
                        plot_model(anomaly_model, plot='tsne', display_format='streamlit')
                        plot_model(anomaly_model, plot='umap', display_format='streamlit')

                    # Fit a classification model on the anomaly labels to extract
                    # feature importance
                    if feat_imp_select:
                        st.header("Feature Importance")
                        from pycaret.classification import setup as clf_setup, create_model as clf_create_model, get_config
                        # drop the score column so it cannot trivially predict the label
                        clf_setup(anomaly_model_2.drop(columns=['Anomaly_Score']), target='Anomaly')
                        lr = clf_create_model('lr')
                        # absolute logistic-regression coefficients as importances
                        feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns,
                                                 'Value': abs(lr.coef_[0])})
                        # sort by importance and keep the top 10
                        feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
                        # st.dataframe(feat_imp)
                        st.bar_chart(feat_imp.set_index('Feature'))


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        st.sidebar.error(f"An error occurred: {e}")
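
# --- Optional refactor sketch (not wired into main) ---
# Both pages repeat the same CSV-selection/upload/read logic. A minimal,
# hypothetical helper such as the one below could factor it out; the function
# name and the `key` parameter are assumptions, not part of the original app.
def load_selected_csv(key):
    """Return a DataFrame from a listed or uploaded CSV file, or None."""
    csv_files = [f for f in os.listdir("./") if f.endswith(".csv")]
    selected_csv = st.selectbox("Select a CSV file from the list",
                                ["None"] + csv_files, key=key + "_select")
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv",
                                     key=key + "_upload")
    if uploaded_file is not None:
        try:
            return pd.read_csv(uploaded_file, sep=',')
        except ValueError:
            uploaded_file.seek(0)  # rewind before retrying with '|' delimiter
            return pd.read_csv(uploaded_file, sep='|', encoding='latin-1')
    if selected_csv != "None":
        return pd.read_csv(selected_csv)
    return None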