dperales committed on
Commit
a5daa47
1 Parent(s): b9cd793

Update app_copy.py

Browse files
Files changed (1) hide show
  1. app_copy.py +255 -126
app_copy.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import pandas as pd
3
  import numpy as np
 
4
  import matplotlib.pyplot as plt
5
  import matplotlib as mpl
6
  import pycaret
@@ -12,149 +13,277 @@ from PIL import ImageColor
12
  from PIL import ImageDraw
13
  from PIL import ImageFont
14
 
15
- hide_streamlit_style = """
16
- <style>
17
- #MainMenu {visibility: hidden;}
18
- footer {visibility: hidden;}
19
- </style>
20
- """
21
- st.markdown(hide_streamlit_style, unsafe_allow_html=True)
22
-
23
- with st.sidebar:
24
- image = Image.open('./itaca_logo.png')
25
- st.image(image,use_column_width=True)
26
- page = option_menu(menu_title='Menu',
27
- menu_icon="robot",
28
- options=["Clustering Analysis",
29
- "Anomaly Detection"],
30
- icons=["chat-dots",
31
- "key"],
32
- default_index=0
33
- )
34
-
35
- st.title('ITACA Insurance Core AI Module')
36
-
37
- if page == "Clustering Analysis":
38
- st.header('Clustering Analysis')
39
-
40
- st.write(
41
- """
42
- """
43
- )
44
-
45
- # import pycaret unsupervised models
46
- from pycaret.clustering import *
47
- # import ClusteringExperiment
48
- from pycaret.clustering import ClusteringExperiment
49
-
50
- # Upload the CSV file
51
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
52
-
53
- # Define the unsupervised model
54
- clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
55
- selected_model = st.selectbox("Choose a clustering model", clusteringmodel)
56
-
57
- # Define the options for the dropdown list
58
- numclusters = [2, 3, 4, 5, 6]
59
- # selected_clusters = st.selectbox("Choose a number of clusters", numclusters)
60
- selected_clusters = st.slider("Choose a number of clusters", min_value=2, max_value=10, value=4)
61
-
62
-
63
- # Read and display the CSV file
64
- if uploaded_file is not None:
65
- try:
66
- delimiter = ','
67
- insurance_claims = pd.read_csv (uploaded_file, sep=delimiter)
68
- except ValueError:
69
- delimiter = '|'
70
- insurance_claims = pd.read_csv (uploaded_file, sep=delimiter, encoding='latin-1')
71
 
72
- s = setup(insurance_claims, session_id = 123, log_experiment='mlflow', experiment_name='fraud_detection')
 
 
73
 
74
- exp_clustering = ClusteringExperiment()
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- # init setup on exp
77
- exp_clustering.setup(insurance_claims, session_id = 123)
78
 
79
- if st.button("Prediction"):
80
- with st.spinner("Analyzing..."):
81
- # train kmeans model
82
- cluster_model = create_model(selected_model, num_clusters = selected_clusters)
83
 
84
- cluster_model_2 = assign_model(cluster_model)
85
- cluster_model_2
 
 
86
 
87
- all_metrics = get_metrics()
88
- all_metrics
 
 
89
 
90
- cluster_results = pull()
91
- cluster_results
 
 
 
 
 
92
 
93
- # plot pca cluster plot
94
- plot_model(cluster_model, plot = 'cluster', display_format = 'streamlit')
95
-
96
- if selected_model != 'ap':
97
- plot_model(cluster_model, plot = 'tsne', display_format = 'streamlit')
98
-
99
- if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
100
- plot_model(cluster_model, plot = 'elbow', display_format = 'streamlit')
101
-
102
- if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
103
- plot_model(cluster_model, plot = 'silhouette', display_format = 'streamlit')
104
-
105
- if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
106
- plot_model(cluster_model, plot = 'distance', display_format = 'streamlit')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- if selected_model != 'ap':
109
- plot_model(cluster_model, plot = 'distribution', display_format = 'streamlit')
 
 
 
 
 
110
 
111
- elif page == "Anomaly Detection":
112
- st.header('Anomaly Detection')
 
113
 
114
- st.write(
115
- """
116
- """
117
- )
 
 
 
 
 
118
 
119
- # import pycaret anomaly
120
- from pycaret.anomaly import *
121
- # import AnomalyExperiment
122
- from pycaret.anomaly import AnomalyExperiment
123
 
124
- # Upload the CSV file
125
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
126
-
127
- # Define the unsupervised model
128
- anomalymodel = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
129
- selected_model = st.selectbox("Choose an anomaly model", anomalymodel)
130
-
131
- # Read and display the CSV file
132
- if uploaded_file is not None:
133
- try:
134
- delimiter = ','
135
- insurance_claims = pd.read_csv (uploaded_file, sep=delimiter)
136
- except ValueError:
137
- delimiter = '|'
138
- insurance_claims = pd.read_csv (uploaded_file, sep=delimiter, encoding='latin-1')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
- s = setup(insurance_claims, session_id = 123)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
- exp_anomaly = AnomalyExperiment()
 
 
 
 
 
 
 
 
 
143
 
144
- # init setup on exp
145
- exp_anomaly.setup(insurance_claims, session_id = 123)
 
 
 
 
 
146
 
147
- if st.button("Prediction"):
148
- with st.spinner("Analyzing..."):
149
- # train model
150
- anomaly_model = create_model(selected_model)
151
 
152
- anomaly_model_2 = assign_model(anomaly_model)
153
- anomaly_model_2
 
154
 
155
- anomaly_results = pull()
156
- anomaly_results
 
 
 
157
 
158
- # plot
159
- plot_model(anomaly_model, plot = 'tsne', display_format = 'streamlit')
160
- plot_model(anomaly_model, plot = 'umap', display_format = 'streamlit')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import pandas as pd
3
  import numpy as np
4
+ import seaborn as sns
5
  import matplotlib.pyplot as plt
6
  import matplotlib as mpl
7
  import pycaret
 
13
  from PIL import ImageDraw
14
  from PIL import ImageFont
15
 
16
def _choose_dataset(directory="./"):
    """Render the dataset pickers for one page.

    Shows a dropdown of CSV files found in *directory* plus an upload
    widget. Returns (selected_csv, uploaded_file); selected_csv is the
    literal string "None" when nothing was picked from the list.
    """
    all_files = os.listdir(directory)
    csv_files = [file for file in all_files if file.endswith(".csv")]
    selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    return selected_csv, uploaded_file


def _read_claims(uploaded_file, selected_csv, directory="./"):
    """Load the claims dataset from an upload or from a local CSV file.

    Tries a comma delimiter first and falls back to '|' with latin-1
    encoding (the two export formats the app accepts). The upload stream
    is rewound before the fallback read — without seek(0) the second
    read_csv would see an exhausted file.
    """
    if uploaded_file:
        try:
            return pd.read_csv(uploaded_file, sep=',')
        except ValueError:
            uploaded_file.seek(0)  # rewind: the failed read consumed the stream
            return pd.read_csv(uploaded_file, sep='|', encoding='latin-1')
    return pd.read_csv(os.path.join(directory, selected_csv))


def _show_feature_importance(labeled_df, target):
    """Fit a quick logistic regression on *labeled_df* against *target*
    and chart the top-10 absolute coefficients as a feature-importance
    proxy. The pycaret.classification import is kept local so it cannot
    shadow the clustering/anomaly setup()/create_model() names.
    """
    st.header("Feature Importance")
    from pycaret.classification import setup, create_model, get_config
    setup(labeled_df, target=target)
    lr = create_model('lr')
    # Recreate the importance table from the fitted coefficients.
    feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns,
                             'Value': abs(lr.coef_[0])})
    feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
    st.bar_chart(feat_imp.set_index('Feature'))


def main():
    """ITACA Insurance Core AI Module — Streamlit entry point.

    Renders a sidebar (logo, page menu, shared settings) and two pages:
    a clustering-analysis workflow and an anomaly-detection workflow,
    both backed by pycaret's unsupervised APIs.
    """
    # Hide Streamlit's default chrome (hamburger menu and footer).
    hide_streamlit_style = """
    <style>
    #MainMenu {visibility: hidden;}
    footer {visibility: hidden;}
    </style>
    """
    st.markdown(hide_streamlit_style, unsafe_allow_html=True)

    with st.sidebar:
        image = Image.open('itaca_logo.png')
        st.image(image, width=150)  # ,use_column_width=True)
        page = option_menu(menu_title='Menu',
                           menu_icon="robot",
                           options=["Clustering Analysis",
                                    "Anomaly Detection"],
                           icons=["chat-dots",
                                  "key"],
                           default_index=0)

        # Settings shared by both pages.
        st.header("Settings")

        # text_input defaults must be strings; the value is parsed below.
        num_lines = st.text_input("% of lines to be processed:", value="100")
        graph_select = st.checkbox("Show Graphics", value=True)
        feat_imp_select = st.checkbox("Feature Importance", value=False)

        selected_clusters = st.slider("Choose a number of clusters",
                                      min_value=2, max_value=10, value=4)

        # Preprocessing options forwarded verbatim to pycaret's setup().
        p_remove_multicollinearity = st.checkbox("Remove Multicollinearity", value=False)
        p_multicollinearity_threshold = st.slider("Choose multicollinearity thresholds",
                                                  min_value=0.0, max_value=1.0, value=0.9)
        # p_remove_outliers = st.checkbox("Remove Outliers", value=False)
        # p_outliers_method = st.selectbox ("Choose an Outlier Method", ["iforest", "ee", "lof"])
        p_transformation = st.checkbox("Choose Power Transform", value=False)
        p_normalize = st.checkbox("Choose Normalize", value=False)
        p_pca = st.checkbox("Choose PCA", value=False)
        p_pca_method = st.selectbox("Choose a PCA Method", ["linear", "kernel", "incremental"])

    # Robust percentage parsing: non-numeric input falls back to 100%,
    # and the value is clamped to [1, 100] so head() never gets junk.
    try:
        pct = min(max(int(num_lines), 1), 100)
    except ValueError:
        pct = 100

    st.title('ITACA Insurance Core AI Module')

    if page == "Clustering Analysis":
        st.header('Clustering Analysis')

        st.write(
        """
        """
        )

        # pycaret unsupervised clustering API
        from pycaret.clustering import setup, create_model, assign_model, pull, plot_model
        from pycaret.clustering import ClusteringExperiment

        selected_csv, uploaded_file = _choose_dataset()

        # Define the unsupervised model
        clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
        selected_model = st.selectbox("Choose a clustering model", clusteringmodel)

        # Read and display the CSV file
        if selected_csv != "None" or uploaded_file is not None:
            insurance_claims = _read_claims(uploaded_file, selected_csv)

            num_rows = int(insurance_claims.shape[0] * pct / 100)
            insurance_claims_reduced = insurance_claims.head(num_rows)
            st.write("Rows to be processed: " + str(num_rows))

            all_columns = insurance_claims_reduced.columns.tolist()
            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
            insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()

            st.header("Inference Description")
            st.write(insurance_claims_reduced.describe().T)

            num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns

            # Correlation heatmap — only meaningful with numeric columns.
            if len(num_col) > 0:
                st.header("Heat Map")
                corr_matrix = insurance_claims_reduced[num_col].corr()
                fig, ax = plt.subplots(figsize=(12, 8))
                sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
                ax.set_title('Correlation Heatmap')
                st.pyplot(fig)

            if st.button("Prediction"):
                setup(insurance_claims_reduced, session_id=123,
                      remove_multicollinearity=p_remove_multicollinearity,
                      multicollinearity_threshold=p_multicollinearity_threshold,
                      # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
                      transformation=p_transformation,
                      normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
                exp_clustering = ClusteringExperiment()
                # init setup on exp
                exp_clustering.setup(insurance_claims_reduced, session_id=123)

                with st.spinner("Analyzing..."):
                    # train the selected clustering model
                    cluster_model = create_model(selected_model, num_clusters=selected_clusters)

                    cluster_model_2 = assign_model(cluster_model)

                    # Summary statistics per cluster, numeric columns only —
                    # aggregating object dtypes with mean/skew raises TypeError.
                    numeric_cols = cluster_model_2.select_dtypes(include='number').columns.tolist()
                    cluster_summary = cluster_model_2.groupby('Cluster')[numeric_cols].agg(
                        ['count', 'mean', 'median', 'min', 'max', 'std', 'var', 'sum',
                         ('quantile_25', lambda x: x.quantile(0.25)),
                         ('quantile_75', lambda x: x.quantile(0.75)), 'skew'])
                    st.header("Cluster Summary")
                    st.write(cluster_summary)
                    st.header("Assign Model")
                    st.write(cluster_model_2)

                    st.header("Clustering Metrics")
                    cluster_results = pull()
                    st.write(cluster_results)

                    if graph_select:
                        st.header("Clustering Plots")
                        # plot pca cluster plot
                        plot_model(cluster_model, plot='cluster', display_format='streamlit')

                        # Some plots are unsupported by some estimators.
                        if selected_model != 'ap':
                            plot_model(cluster_model, plot='tsne', display_format='streamlit')
                        if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
                            plot_model(cluster_model, plot='elbow', display_format='streamlit')
                        if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
                            plot_model(cluster_model, plot='silhouette', display_format='streamlit')
                        if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
                            plot_model(cluster_model, plot='distance', display_format='streamlit')
                        if selected_model != 'ap':
                            plot_model(cluster_model, plot='distribution', display_format='streamlit')

                    if feat_imp_select:
                        _show_feature_importance(cluster_model_2, 'Cluster')

    elif page == "Anomaly Detection":
        st.header('Anomaly Detection')

        st.write(
        """
        """
        )

        # pycaret anomaly-detection API
        from pycaret.anomaly import setup, create_model, assign_model, pull, plot_model
        from pycaret.anomaly import AnomalyExperiment

        selected_csv, uploaded_file = _choose_dataset()

        # Define the unsupervised model
        anomalymodel = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
        selected_model = st.selectbox("Choose an anomaly model", anomalymodel)

        # Read and display the CSV file
        if selected_csv != "None" or uploaded_file is not None:
            insurance_claims = _read_claims(uploaded_file, selected_csv)

            num_rows = int(insurance_claims.shape[0] * pct / 100)
            insurance_claims_reduced = insurance_claims.head(num_rows)
            st.write("Rows to be processed: " + str(num_rows))

            all_columns = insurance_claims_reduced.columns.tolist()
            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
            insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()

            if st.button("Prediction"):
                setup(insurance_claims_reduced, session_id=123,
                      remove_multicollinearity=p_remove_multicollinearity,
                      multicollinearity_threshold=p_multicollinearity_threshold,
                      # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
                      transformation=p_transformation,
                      normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)

                exp_anomaly = AnomalyExperiment()
                # init setup on exp
                exp_anomaly.setup(insurance_claims_reduced, session_id=123)

                with st.spinner("Analyzing..."):
                    # train model
                    anomaly_model = create_model(selected_model)

                    st.header("Assign Model")
                    anomaly_model_2 = assign_model(anomaly_model)
                    st.write(anomaly_model_2)

                    st.header("Anomaly Metrics")
                    anomaly_results = pull()
                    st.write(anomaly_results)

                    if graph_select:
                        st.header("Anomaly Plots")
                        plot_model(anomaly_model, plot='tsne', display_format='streamlit')
                        plot_model(anomaly_model, plot='umap', display_format='streamlit')

                    if feat_imp_select:
                        _show_feature_importance(anomaly_model_2, 'Anomaly')
286
# Script entry point. The guard prevents the app from running on import;
# streamlit still executes the script as __main__, so behavior under
# `streamlit run` is unchanged. Uncaught errors surface in the sidebar
# instead of crashing with a bare traceback.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        st.sidebar.error(f"An error occurred: {e}")