"""Streamlit front end for the ITACA Insurance Core AI Module.

The sidebar selects one of two pages ("Clustering Analysis" or
"Anomaly Detection") and the PyCaret preprocessing options shared by both.
Each page lets the user pick a CSV file from the working directory or upload
one, choose a model and a subset of columns, and then runs the matching
PyCaret experiment and renders its tables and plots.
"""
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import pycaret
import streamlit as st
from streamlit_option_menu import option_menu
import PIL
from PIL import Image
from PIL import ImageColor
from PIL import ImageDraw
from PIL import ImageFont

# Directory scanned for ready-made CSV files.
CSV_DIRECTORY = "./"

# Seed handed to PyCaret's setup().
SESSION_ID = 123

CLUSTERING_MODELS = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan',
                     'optics', 'birch']

ANOMALY_MODELS = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn',
                  'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']

# (plot name, models for which the plot is skipped), in display order.
CLUSTER_PLOTS = (
    ('cluster', ()),
    ('tsne', ('ap',)),
    ('elbow', ('ap', 'meanshift', 'dbscan', 'optics')),
    ('silhouette', ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics')),
    ('distance', ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch')),
    ('distribution', ('ap',)),
)

ANOMALY_PLOTS = ('tsne', 'umap')

# Aggregations computed per cluster for every numeric column.
CLUSTER_SUMMARY_STATS = [
    'count', 'mean', 'median', 'min', 'max', 'std', 'var', 'sum',
    ('quantile_25', lambda x: x.quantile(0.25)),
    ('quantile_75', lambda x: x.quantile(0.75)),
    'skew',
]


def _choose_data_source():
    """Render the CSV picker and the uploader.

    Returns:
        A ``(selected_csv, uploaded_file)`` tuple. ``selected_csv`` is the
        chosen file name or the string ``"None"``; ``uploaded_file`` is the
        value returned by ``st.file_uploader`` (``None`` when nothing was
        uploaded).
    """
    csv_files = [name for name in os.listdir(CSV_DIRECTORY)
                 if name.endswith(".csv")]
    selected_csv = st.selectbox("Select a CSV file from the list",
                                ["None"] + csv_files)
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    return selected_csv, uploaded_file


def _load_claims(uploaded_file, selected_csv: str) -> pd.DataFrame:
    """Read the dataset, preferring the uploaded file over the listed one.

    An upload is first parsed as comma-separated text; if pandas raises
    ``ValueError`` it is re-read as pipe-separated latin-1 text.
    """
    if uploaded_file is None:
        return pd.read_csv(os.path.join(CSV_DIRECTORY, selected_csv))
    try:
        return pd.read_csv(uploaded_file, sep=',')
    except ValueError:
        # The failed attempt has already consumed part of the stream; rewind
        # so the fallback parser sees the file from its first byte.
        uploaded_file.seek(0)
        return pd.read_csv(uploaded_file, sep='|', encoding='latin-1')


def _show_correlation_heatmap(claims: pd.DataFrame) -> None:
    """Draw the correlation heatmap of the non-object columns of ``claims``."""
    num_col = claims.select_dtypes(exclude=['object']).columns
    corr_matrix = claims[num_col].corr()
    if corr_matrix.empty:
        # seaborn cannot draw an empty matrix; tell the user instead.
        st.info("No numeric columns available for a correlation heatmap.")
        return

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
    ax.set_title('Correlation Heatmap')
    st.pyplot(fig)
    # The script reruns on every interaction; release the figure so that
    # open figures do not accumulate.
    plt.close(fig)


def _choose_columns(claims: pd.DataFrame) -> list:
    """Render the column multiselect (all columns preselected)."""
    all_columns = claims.columns.tolist()
    return st.multiselect("Choose columns", all_columns, default=all_columns)


def _prediction_requested(selected_columns: list) -> bool:
    """Return True when the button was pressed and columns are selected."""
    if not st.button("Prediction"):
        return False
    if not selected_columns:
        st.warning("Select at least one column before running the analysis.")
        return False
    return True


def render_clustering_page(num_clusters: int, setup_options: dict) -> None:
    """Render the "Clustering Analysis" page.

    Args:
        num_clusters: Value forwarded to ``create_model(num_clusters=...)``.
        setup_options: Keyword arguments forwarded to PyCaret's ``setup``.
    """
    # Imported here so that only the selected page's PyCaret module is loaded.
    from pycaret.clustering import (assign_model, create_model, plot_model,
                                    pull, setup)

    st.header('Clustering Analysis')
    st.write(""" """)

    selected_csv, uploaded_file = _choose_data_source()
    selected_model = st.selectbox("Choose a clustering model",
                                  CLUSTERING_MODELS)
    if selected_csv == "None" and uploaded_file is None:
        return

    insurance_claims = _load_claims(uploaded_file, selected_csv)
    st.write(insurance_claims.describe().T)
    _show_correlation_heatmap(insurance_claims)

    selected_columns = _choose_columns(insurance_claims)
    if not _prediction_requested(selected_columns):
        return

    insurance_claims = insurance_claims[selected_columns].copy()
    # One setup() call is enough: the functional create_model/assign_model/
    # pull calls below operate on the experiment it creates.
    setup(insurance_claims, session_id=SESSION_ID, **setup_options)

    with st.spinner("Analyzing..."):
        cluster_model = create_model(selected_model,
                                     num_clusters=num_clusters)
        clustered = assign_model(cluster_model)

        # Statistics such as mean/std/skew are only defined for numbers, so
        # summarise the numeric columns only.
        numeric_cols = [col for col
                        in clustered.select_dtypes(include='number').columns
                        if col != 'Cluster']
        if numeric_cols:
            cluster_summary = (clustered.groupby('Cluster')[numeric_cols]
                               .agg(CLUSTER_SUMMARY_STATS))
            st.write(cluster_summary)
        st.write(clustered)

        cluster_results = pull()
        st.write(cluster_results)

        for plot_name, unsupported_models in CLUSTER_PLOTS:
            if selected_model not in unsupported_models:
                plot_model(cluster_model, plot=plot_name,
                           display_format='streamlit')


def render_anomaly_page(setup_options: dict) -> None:
    """Render the "Anomaly Detection" page.

    Args:
        setup_options: Keyword arguments forwarded to PyCaret's ``setup``.
    """
    # Imported here so that only the selected page's PyCaret module is loaded.
    from pycaret.anomaly import (assign_model, create_model, plot_model,
                                 pull, setup)

    st.header('Anomaly Detection')
    st.write(""" """)

    selected_csv, uploaded_file = _choose_data_source()
    selected_model = st.selectbox("Choose an anomaly model", ANOMALY_MODELS)
    if selected_csv == "None" and uploaded_file is None:
        return

    insurance_claims = _load_claims(uploaded_file, selected_csv)

    selected_columns = _choose_columns(insurance_claims)
    if not _prediction_requested(selected_columns):
        return

    insurance_claims = insurance_claims[selected_columns].copy()
    # One setup() call is enough: the functional create_model/assign_model/
    # pull calls below operate on the experiment it creates.
    setup(insurance_claims, session_id=SESSION_ID, **setup_options)

    with st.spinner("Analyzing..."):
        anomaly_model = create_model(selected_model)
        st.write(assign_model(anomaly_model))

        anomaly_results = pull()
        st.write(anomaly_results)

        for plot_name in ANOMALY_PLOTS:
            plot_model(anomaly_model, plot=plot_name,
                       display_format='streamlit')


hide_streamlit_style = """ """
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

with st.sidebar:
    image = Image.open('itaca_logo.png')
    st.image(image, width=150)

    page = option_menu(menu_title='Menu',
                       menu_icon="robot",
                       options=["Clustering Analysis", "Anomaly Detection"],
                       icons=["chat-dots", "key"],
                       default_index=0)

    # Settings shared by both pages.
    st.header("Settings")
    selected_clusters = st.slider("Choose a number of clusters",
                                  min_value=2, max_value=10, value=4)
    p_remove_multicollinearity = st.checkbox("Remove Multicollinearity",
                                             value=False)
    p_multicollinearity_threshold = st.slider(
        "Choose multicollinearity thresholds",
        min_value=0.0, max_value=1.0, value=0.9)
    p_transformation = st.checkbox("Choose Power Transform", value=False)
    p_normalize = st.checkbox("Choose Normalize", value=False)
    p_pca = st.checkbox("Choose PCA", value=False)
    p_pca_method = st.selectbox("Choose a PCA Method",
                                ["linear", "kernel", "incremental"])

# Preprocessing keyword arguments passed unchanged to PyCaret's setup().
setup_options = {
    'remove_multicollinearity': p_remove_multicollinearity,
    'multicollinearity_threshold': p_multicollinearity_threshold,
    'transformation': p_transformation,
    'normalize': p_normalize,
    'pca': p_pca,
    'pca_method': p_pca_method,
}

st.title('ITACA Insurance Core AI Module')

if page == "Clustering Analysis":
    render_clustering_page(selected_clusters, setup_options)
elif page == "Anomaly Detection":
    render_anomaly_page(setup_options)