Manoj committed on
Commit
a9415a6
1 Parent(s): 87ec425
Home.py ADDED
@@ -0,0 +1,96 @@
+ import streamlit as st
+ import pandas as pd
+ import os
+ import base64
+ from pathlib import Path
+
+ path = os.path.dirname(__file__)
+ with open(f"{path}/logo.png", "rb") as file_:
+     contents = file_.read()
+ data_url = base64.b64encode(contents).decode("utf-8")
+
+ def load_local_css(file_name):
+     with open(file_name) as f:
+         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+
+ def set_header():
+     return st.markdown(
+         f"""<div class='main-header'>
+                <h1>Synthetic Control</h1>
+                <img src="data:image;base64,{data_url}" alt="Logo">
+             </div>""",
+         unsafe_allow_html=True,
+     )
+
+
+ st.set_page_config(layout="wide")
+ load_local_css("styles.css")
+ set_header()
+
+ st.title("Input data")
+
+ data_file = st.file_uploader(
+     label="Choose a file",
+     accept_multiple_files=False,
+     key="user_upload_file",
+     type=["csv", "xlsx"]
+ )
+
+ info_placeholder = st.empty()
+
+ if data_file:
+     # df = pd.read_csv(data_file, dtype={'individual_id_ov': str})
+     dtype = {'individual_id_ov': 'str',
+              'past_3month_GMV_GMA': 'float64',
+              'past_3month_qty_GMA': 'int64',
+              'past_3month_orders_GMA': 'int64',
+              'past_6month_GMV_GMA': 'float64',
+              'past_6month_qty_GMA': 'int64',
+              'past_6month_orders_GMA': 'int64',
+              'past_9month_GMV_GMA': 'float64',
+              'past_9month_qty_GMA': 'int64',
+              'past_9month_orders_GMA': 'int64',
+              'past_12month_GMV_GMA': 'float64',
+              'past_12month_qty_GMA': 'int64',
+              'past_12month_orders_GMA': 'int64',
+              'avg_order_gap_between_GMA_purchases': 'float64',
+              'days_since_last_GMA_purchase': 'float64',
+              'age': 'float64',
+              'gender': 'str',
+              'income_group': 'str',
+              'age_group': 'str',
+              'urbanicity': 'str',
+              'ethnicity': 'str',
+              'Kids': 'str',
+              'hh_size_excl_child': 'str',
+              'hh_adult_qty': 'float64',
+              'hh_scs_est_per1000_income_amt': 'float64',
+              'avg_order_gap_between_WMT_purchases': 'float64',
+              'days_since_last_WMT_purchase': 'float64',
+              'Y': 'int64'}
+     df = pd.read_excel(data_file, sheet_name='sheet1', dtype=dtype, engine='openpyxl')
+     st.session_state.df = df
+     st.write(df.head())
+     with info_placeholder:
+         st.success("File upload successful")
+
+     data_file.seek(0)  # rewind the upload buffer before reading the second sheet
+     plot_df = pd.read_excel(data_file, sheet_name='sheet2')
+     st.session_state.plot_df = plot_df
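+     # Note: the workbook is expected to contain 'sheet1' (stored as
+     # st.session_state.df) and 'sheet2' (stored as st.session_state.plot_df);
+     # a .csv upload would need the commented pd.read_csv path above instead.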
+     # start_date = st.date_input("Start date")
+     # end_date = st.date_input("End date")
+
+     # # Show the selected date range
+     # st.write("Selected date range:", start_date, "to", end_date)
+
+ # uploaded_file = st.file_uploader("Choose a file")
+
+ # if uploaded_file is not None:
+ #     df = pd.read_csv(uploaded_file, dtype={'individual_id_ov': str})
+ #     st.session_state.df = df
+ #     st.success("File upload successful, here is the data preview")
+ #     st.write(df.head())
+
+
+
+
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
  colorTo: yellow
  sdk: streamlit
  sdk_version: 1.40.0
- app_file: app.py
+ app_file: Home.py
  pinned: false
  ---
logo.png ADDED
pages/pages/1_Imputations.py ADDED
@@ -0,0 +1,415 @@
+ ##### SAFE IMPUTATION #####
+
+ import pandas as pd
+ import numpy as np
+ from scipy import stats
+ import warnings
+ import streamlit as st
+ import base64
+
+ def outlier_per_col(df, col):
+     q1 = df[col].quantile(0.25)
+     q3 = df[col].quantile(0.75)
+     iqr = q3 - q1
+
+     # D'Agostino-Pearson normality test (stats.normaltest) on the column
+     stat, p = stats.normaltest(df[col])
+
+     # if p > 0.05 the data is consistent with a normal distribution;
+     # if p <= 0.05 the data is not normally distributed
+     if p <= 0.05:
+         lower_bound = q1 - 1.5 * iqr
+         upper_bound = q3 + 1.5 * iqr
+         outlier_df = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
+         outlier_per = (len(outlier_df) / len(df[col])) * 100
+     else:
+         z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
+         outlier_df = df[(z_score > 3)]
+         outlier_per = len(outlier_df) / len(df[col]) * 100
+     return outlier_per
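+
+ # A minimal usage sketch (hypothetical data, not part of the app flow):
+ #   demo = pd.DataFrame({"x": list(range(20)) + [500]})
+ #   outlier_per_col(demo, "x")  # -> % of rows outside the IQR or z-score fences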
+
+ def summary_stats(df, per_to_drop):
+     summary_df = df.isna().sum().reset_index().rename(columns={'index': 'variable', 0: 'null'})
+     summary_df['%null'] = (100 * summary_df['null'] / len(df)).round(2)
+     summary_df = summary_df.merge(df.dtypes.reset_index().rename(columns={'index': 'variable', 0: 'type'}), on='variable')
+     summary_df = summary_df.drop(columns=['null'])
+     summary_df = summary_df.drop(summary_df[summary_df['%null'] > per_to_drop].index)
+     df_numeric = df.select_dtypes(exclude='object')
+     df_categorical = df.select_dtypes(include='object')
+     if not df_numeric.empty:
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore")
+             summary_df['outlier%'] = summary_df[summary_df['variable'].isin(df_numeric.columns)].apply(lambda x: outlier_per_col(df_numeric, x['variable']), axis=1)
+     else:
+         summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'outlier%': []})])
+     summary_df = summary_df.merge((df.select_dtypes(exclude=['object']).nunique() / df.select_dtypes(exclude=['object']).count() * 100).reset_index().rename(columns={'index': 'variable', 0: 'unique%'}).round(2), on='variable', how='left').round(2)
+     summary_df = summary_df.merge(df.mean(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'mean'}).round(2), on='variable', how='left')
+     summary_df = summary_df.merge(df.std(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'standard deviation'}).round(2), on='variable', how='left')
+     summary_df = (summary_df.merge(df.var(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'variance'}), on='variable', how='left').assign(variance=lambda x: x['variance'].apply(lambda y: "{:.2f}".format(y))))
+     summary_df = summary_df.merge(df.skew(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'skewness'}).round(2), on='variable', how='left')
+     summary_df = summary_df.merge(df.kurt(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'kurtosis'}).round(2), on='variable', how='left')
+     summary_df = summary_df.merge(df.min(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'min'}), on='variable', how='left')
+     summary_df = summary_df.merge(df.max(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'max'}), on='variable', how='left')
+     summary_df['range'] = summary_df['max'] - summary_df['min']
+     if not df_numeric.empty:
+         summary_df = summary_df.merge((df.describe().loc['75%'].T - df.describe().loc['25%'].T).reset_index().rename(columns={'index': 'variable', 0: 'iqr'}), on='variable', how='left')
+     else:
+         summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'iqr': []})])
+     summary_df = summary_df.merge(df.median(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'median'}), on='variable', how='left')
+     if not df_categorical.empty:
+         summary_df = summary_df.merge(df.select_dtypes(include=['object']).mode().iloc[0].reset_index().rename(columns={'index': 'variable', 0: 'mode'}), on='variable', how='left')
+         summary_df = summary_df.merge(df.select_dtypes(include=['object']).nunique().reset_index().rename(columns={'index': 'variable', 0: 'distinct count'}), on='variable', how='left')
+     else:
+         summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'mode': []})])
+         summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'distinct count': []})])
+     return summary_df
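+
+ # summary_stats returns one row per retained variable with: %null, type,
+ # outlier%, unique%, mean, standard deviation, variance, skewness, kurtosis,
+ # min, max, range, iqr, median, and (for categorical columns only) mode and
+ # distinct count.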
+
+
+ def mean_imputation(df, col):
+     df[col].fillna(round(df[col].mean(), 2), inplace=True)
+
+ def median_imputation(df, col):
+     median = df[col].median()
+     df[col].fillna(round(median, 2), inplace=True)
+
+ def drop_rows(df, col):
+     df.dropna(subset=[col], inplace=True)
+
+ def drop_column(df, col):
+     df.drop(col, axis=1, inplace=True)
+
+ def mode_imputation(df, col):
+     mode = df[col].mode()[0]
+     df[col].fillna(mode, inplace=True)
+
+ def arbitrary_val(df, col, val):
+     df[col].fillna(val, inplace=True)
+
+ def linear_interpolate(df, col):
+     df[col].interpolate(method='linear', inplace=True)
+
+ def polynomial_interpolate(df, col):
+     df[col].interpolate(method='polynomial', order=2, inplace=True)
+
+ def interpolate_padding_forward(df, col):
+     df[col].fillna(method='ffill', inplace=True)
+
+ def interpolate_padding_backward(df, col):
+     df[col].fillna(method='bfill', inplace=True)
+
+ def fill_0(df, col):
+     df[col].fillna(0, inplace=True)
+
+ def remove_outliers(df, col):
+     stat, p = stats.normaltest(df[col])
+     if p <= 0.05:
+         q1 = df[col].quantile(0.25)
+         q3 = df[col].quantile(0.75)
+         iqr = q3 - q1
+         lower_bound = q1 - 1.5 * iqr
+         upper_bound = q3 + 1.5 * iqr
+         df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
+     else:
+         z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
+         df = df[(z_score < 3)]
+     return df
+
+ def mean_outlier(df, col):
+     stat, p = stats.normaltest(df[col])
+     if p <= 0.05:
+         q1 = df[col].quantile(0.25)
+         q3 = df[col].quantile(0.75)
+         iqr = q3 - q1
+         lower_bound = q1 - 1.5 * iqr
+         upper_bound = q3 + 1.5 * iqr
+         df.loc[df[col] < lower_bound, col] = df[col].mean()
+         df.loc[df[col] > upper_bound, col] = df[col].mean()
+     else:
+         z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
+         df.loc[z_score > 3, col] = df[col].mean()
+     return df
+
+ def median_outlier(df, col):
+     stat, p = stats.normaltest(df[col])
+     if p <= 0.05:
+         q1 = df[col].quantile(0.25)
+         q3 = df[col].quantile(0.75)
+         iqr = q3 - q1
+         lower_bound = q1 - 1.5 * iqr
+         upper_bound = q3 + 1.5 * iqr
+         df.loc[df[col] < lower_bound, col] = df[col].median()
+         df.loc[df[col] > upper_bound, col] = df[col].median()
+     else:
+         z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
+         df.loc[z_score > 3, col] = df[col].median()
+     return df
+
+ def outlier_capping(df, col):
+     stat, p = stats.normaltest(df[col])
+     if p <= 0.05:
+         q1 = df[col].quantile(0.25)
+         q3 = df[col].quantile(0.75)
+         iqr = q3 - q1
+         lower_bound = q1 - 1.5 * iqr
+         upper_bound = q3 + 1.5 * iqr  # was q1 + 1.5 * iqr, which capped far too low
+         df[col] = np.where(df[col] >= upper_bound, upper_bound, np.where(df[col] <= lower_bound, lower_bound, df[col]))
+     else:
+         upper_limit = df[col].mean() + (3 * df[col].std())
+         lower_limit = df[col].mean() - (3 * df[col].std())
+         df[col] = np.where(df[col] >= upper_limit, upper_limit, np.where(df[col] <= lower_limit, lower_limit, df[col]))
+     return df
+
+ def perform_treatment_missing(df, col, treatments):
+     if treatments == 'mean':
+         mean_imputation(df, col)
+     elif treatments == 'median':
+         median_imputation(df, col)
+     elif treatments == 'drop row':
+         drop_rows(df, col)
+     elif treatments == 'drop column':
+         drop_column(df, col)
+     elif treatments == 'linear interpolation':
+         linear_interpolate(df, col)
+     elif treatments == 'polynomial interpolation':
+         polynomial_interpolate(df, col)
+     elif treatments == 'ffill':
+         interpolate_padding_forward(df, col)
+     elif treatments == 'bfill':
+         interpolate_padding_backward(df, col)
+     elif treatments == 'mode':
+         mode_imputation(df, col)
+     elif treatments == 'fill_0':
+         fill_0(df, col)
+     else:
+         return df[col]
+
+ def perform_treatment_outlier(df, col, treatments):
+     # the outlier helpers return a (possibly filtered) frame, so capture it
+     if treatments == 'remove':
+         df = remove_outliers(df, col)
+     elif treatments == 'mean':
+         df = mean_outlier(df, col)
+     elif treatments == 'median':
+         df = median_outlier(df, col)  # was median_imputation, a missing-value treatment
+     elif treatments == 'capping':
+         df = outlier_capping(df, col)
+     return df
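+
+ # Example (hypothetical column name): perform_treatment_missing(df, "age", "median")
+ # fills missing values in place, while df = perform_treatment_outlier(df, "age", "capping")
+ # returns the treated frame.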
+
+ def imputed_df(df, edited_df, identifier, flag, per_to_drop=None):
+     if per_to_drop is not None:
+         null_percentage = df.isnull().sum() / df.shape[0] * 100
+         col_to_drop = null_percentage[null_percentage > per_to_drop].keys()
+         df = df.drop(col_to_drop, axis=1)
+
+     cols_with_one_unique = df.columns[df.nunique() == 1]
+     df.drop(cols_with_one_unique, axis=1, inplace=True)
+
+     for col in edited_df['variable'].to_list():
+         perform_treatment_missing(df, col, edited_df.loc[edited_df['variable'] == col, 'Imputation method'].iloc[0])
+         df = perform_treatment_outlier(df, col, edited_df.loc[edited_df['variable'] == col, 'Outlier Treatment'].iloc[0])
+     return df
+
+ # flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
+ # identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))
+
+ # numerical_columns = st.session_state.df.select_dtypes(include=['number']).columns.tolist()
+ # numerical_columns = [x for x in numerical_columns if x != flag]
+ # categorical_columns = st.session_state.df.select_dtypes(include=['object', 'category']).columns.tolist()
+ # categorical_columns = [x for x in categorical_columns if x != identifier]
+
+ # st.session_state.flag = flag
+ # st.session_state.identifier = identifier
+ st.title("Data Summary")
+
+ with st.expander("Data Inputs"):
+     st.subheader("Data Inputs")
+     ui_columns = st.columns((1, 1))
+     columns = set(st.session_state.df.columns)
+     with ui_columns[0]:
+         flag = st.selectbox(
+             label="Flag variable",
+             options=list(columns),
+             index=list(columns).index(st.session_state.flag) if 'flag' in st.session_state and st.session_state.flag is not None else 0
+         )
+         per_to_drop = st.slider(
+             label="Select missing % threshold to drop columns",
+             key="per_to_drop",
+             min_value=0, max_value=100, value=st.session_state.per_to_drop if 'per_to_drop' in st.session_state else 80)
+
+     with ui_columns[-1]:
+         identifier = st.selectbox(
+             label="Identifier",
+             options=list(columns),
+             index=list(columns).index(st.session_state.identifier) if 'identifier' in st.session_state and st.session_state.identifier is not None else 0
+         )
+
+     # numerical_columns = st.session_state.df.select_dtypes(include=['number']).columns.tolist()
+     # numerical_columns = [x for x in numerical_columns if x != flag]
+     # categorical_columns = st.session_state.df.select_dtypes(include=['object', 'category']).columns.tolist()
+     # categorical_columns = [x for x in categorical_columns if x != identifier]
+     # st.session_state.numerical_columns = numerical_columns
+     # st.session_state.categorical_columns = categorical_columns
+     st.session_state.flag = flag
+     st.session_state.identifier = identifier
+
+     # st.subheader("Select Ordinal Columns:")
+     # with st.expander("Select Ordinal Columns:", expanded=True):
+     #     select_all_checkbox = st.checkbox("Select All", key="select_all_checkbox")
+
+     #     options = categorical_columns
+
+     #     # Checkboxes for each column
+     #     ordinal_columns = []
+     #     for option in options:
+     #         if select_all_checkbox or st.checkbox(option, key=f"checkbox_{option}"):
+     #             ordinal_columns.append(option)
+     #     st.session_state.ordinal_columns = list(ordinal_columns)
+
+     # nominal_columns = [x for x in categorical_columns if x not in ordinal_columns]
+     # st.session_state.numerical_columns = numerical_columns
+     # st.session_state.categorical_columns = categorical_columns
+     # st.session_state.ordinal_columns = ordinal_columns
+
+     # Ordinal columns order
+     # ordinal_col_dict = st.session_state.get("ordinal_col_dict", {})
+
+     # ordinal_col_dict = {}
+
+     # for col in ordinal_columns:
+     #     st.subheader(f"Ordering for Unique Values in {col}")
+
+     #     # Get unique values excluding NaN
+     #     unique_values = st.session_state.df[col].dropna().unique()
+
+     #     order_dict = {}
+
+     #     for val in unique_values:
+     #         order = st.number_input(f"Order for {val} in {col}", min_value=1, value=1)
+     #         order_dict[val] = order
+
+     #     ordinal_col_dict[col] = order_dict
+
+     # st.session_state.ordinal_col_dict = ordinal_col_dict
+
+ # User input for percentage threshold to drop columns
+ # per_to_drop = st.slider("Select Percentage Threshold to Drop Columns", min_value=0, max_value=100, value=10)
+ # st.session_state.per_to_drop = per_to_drop
+
+ summary_df = summary_stats(st.session_state.df, per_to_drop)
+ summary_df["Imputation method"] = None
+ summary_df["Outlier Treatment"] = None
+ summary_df["Imputation method"] = np.where(summary_df["type"] == 'object', 'mode', 'mean')
+ summary_df["Outlier Treatment"] = np.where(summary_df["type"] == 'object', summary_df["Outlier Treatment"], 'capping')
+ summary_df = summary_df[~summary_df['variable'].isin([flag, identifier])]
+ st.session_state.summary_df = summary_df
+
+ st.subheader("Variable Summary")
+
+ IMPUTATION_OPTIONS = ["mean", "median", "linear interpolation", "polynomial interpolation", "ffill", "bfill", "mode", "fill_0"]
+ OUTLIER_OPTIONS = ["capping", "remove", "mean", "median"]
+ NON_EDITABLE_COLUMNS = summary_df.columns.to_list()
+
+ def highlight_cols(s):
+     color = "#ccc"
+     return "background-color: %s" % color
+
+ column_config = {
+     "variable": st.column_config.TextColumn(disabled=True, width="medium"),
+     "type": st.column_config.TextColumn(disabled=True, width="medium"),
+     "%null": st.column_config.NumberColumn(disabled=True),
+     "unique%": st.column_config.NumberColumn(disabled=True),
+     "outlier%": st.column_config.NumberColumn(disabled=True),
+     "mean": st.column_config.NumberColumn(disabled=True),
+     "standard deviation": st.column_config.NumberColumn(disabled=True),
+     "variance": st.column_config.NumberColumn(disabled=True),
+     "skewness": st.column_config.NumberColumn(disabled=True),
+     "kurtosis": st.column_config.NumberColumn(disabled=True),
+     "min": st.column_config.NumberColumn(disabled=True),
+     "max": st.column_config.NumberColumn(disabled=True),
+     "range": st.column_config.NumberColumn(disabled=True),
+     "iqr": st.column_config.NumberColumn(disabled=True),
+     "median": st.column_config.NumberColumn(disabled=True),
+     "IV": st.column_config.NumberColumn(disabled=True),
+     "mode": st.column_config.TextColumn(disabled=True),
+     "distinct count": st.column_config.NumberColumn(disabled=True),
+     "Imputation method": st.column_config.SelectboxColumn(
+         options=IMPUTATION_OPTIONS, default=IMPUTATION_OPTIONS[0]  # default expects an option value, not an index
+     ),
+     "Outlier Treatment": st.column_config.SelectboxColumn(
+         options=OUTLIER_OPTIONS, default=OUTLIER_OPTIONS[0]
+     )
+ }
+
+
+ with st.expander("Variables from the data"):
+     edited_df = st.data_editor(
+         st.session_state.summary_df
+         .style.hide(axis="index")
+         .applymap(highlight_cols, subset=NON_EDITABLE_COLUMNS),
+         column_config=column_config,
+     )
+     if st.button("Submit changes"):
+         with st.spinner("Applying imputations"):
+             st.divider()
+             # edited_df already holds the table as edited in the data editor,
+             # including the per-variable Imputation method and Outlier Treatment
+             # (previously it was overwritten with the defaults from summary_df)
+
+             df_imputed = imputed_df(st.session_state.df, edited_df, st.session_state.identifier, st.session_state.flag, st.session_state.per_to_drop)
+             st.session_state.imputed_df = df_imputed
+             st.markdown("Imputed DataFrame")
+             st.dataframe(df_imputed.head(10))
+
+ # Add a download button for the imputed DataFrame
+ # if st.session_state.imputed_df is not None:
+ #     csv_data = st.session_state.imputed_df.to_csv(index=False).encode()
+ #     st.download_button(
+ #         label="Download Imputed DataFrame as CSV",
+ #         data=csv_data,
+ #         file_name="imputed_data.csv",
+ #         mime="text/csv"
+ #     )
+
+ # Add the download button after displaying the DataFrame
+ # if st.dataframe:
+ #     if st.button("Download Imputed Data"):
+ #         imputed_csv = imputed_df.to_csv(index=False)
+ #         b64 = base64.b64encode(imputed_csv.encode()).decode()
+ #         href = f'<a href="data:file/csv;base64,{b64}" download="imputed_data.csv">Download Imputed Data CSV File</a>'
+ #         st.markdown(href, unsafe_allow_html=True)
+
+ if "imputed_df" in st.session_state:
+     if st.button("Download Imputed Data"):
+         df_imputed = st.session_state.imputed_df
+         imputed_csv = df_imputed.to_csv(index=False)
+         b64 = base64.b64encode(imputed_csv.encode()).decode()
+         href = f'<a href="data:file/csv;base64,{b64}" download="imputed_data.csv">Download Imputed Data CSV File</a>'
+         st.markdown(href, unsafe_allow_html=True)
+
+
+
+ # Check if the "Submit changes" button has been clicked
+
+ # if st.button("Submit"):
+ #     st.write("Selected Columns and Ordinal Orders:")
+ #     st.write(ordinal_col_dict)
+
+ #     # Display summary stats
+ #     summary_df = summary_stats(st.session_state.df, per_to_drop)
+ #     st.write("Summary Stats:")
+ #     st.write(summary_df)
+
+ #     # User input for specific column
+ #     col_name = st.selectbox("Select a specific column name:", [None] + list(st.session_state.df.columns))
+
+ #     # Display stats for the specified column
+ #     if col_name in st.session_state.df.columns:
+ #         st.write(f"Stats for column '{col_name}':")
+ #         # Extract relevant information from 'summary_df' for the specific column
+ #         col_summary = summary_df[summary_df['variable'] == col_name][['%null', 'type', 'outlier%', 'unique%', 'mean', 'standard deviation', 'variance', 'skewness', 'kurtosis', 'min', 'max', 'range', 'iqr', 'median', 'mode', 'distinct count']]
+ #         col_summary = col_summary.T.reset_index()
+ #         col_summary.columns = ['Stats', 'Value']
+ #         # Display the summary statistics as a table
+ #         st.table(col_summary)
+ #     else:
+ #         st.warning("Please enter a valid column name.")
pages/pages/2_Profiling.py ADDED
@@ -0,0 +1,775 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.model_selection import GridSearchCV
+ import matplotlib.pyplot as plt
+ from tqdm import tqdm
+ from matplotlib.ticker import MaxNLocator
+ import streamlit as st
+ import ast
+ from collections import defaultdict
+ from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
+ from sklearn.cluster import KMeans, AgglomerativeClustering
+ from sklearn.preprocessing import LabelEncoder
+ # from kmodes.kmodes import KModes
+ import seaborn as sns
+ # from kmodes.kprototypes import KPrototypes
+ import warnings
+ from scipy import stats
+ import scipy.cluster.hierarchy as sch
+ from scipy.spatial.distance import pdist
+ import os
+ import re
+ import time
+ from plotly.subplots import make_subplots
+ import plotly.graph_objects as go
+ import plotly.express as px
+ import base64
+
+
+ def tree_based_bin_data(df, column_name, dep_var, depth_of_tree):
+     df2 = df.copy()
+     df2 = df2.loc[df2[column_name].notnull()]
+     x = df2[column_name].values.reshape(-1, 1)
+     y = df2[dep_var].values
+     params = {'max_depth': range(2, depth_of_tree + 1), 'min_samples_split': [2, 3, 5, 10], 'min_samples_leaf': [int(np.ceil(0.05 * len(x)))]}
+     clf = DecisionTreeClassifier()
+     g_search = GridSearchCV(clf, param_grid=params, scoring='accuracy')
+     g_search.fit(x, y)
+     best_clf = g_search.best_estimator_
+     bin_edges = best_clf.tree_.threshold
+     bin_edges = sorted(set(bin_edges[bin_edges != -2]))
+     tree_based_binned_data = value_bin_data(df, column_name, bin_edges)
+     return tree_based_binned_data
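+
+ # The tree binner fits a shallow DecisionTreeClassifier of the single column
+ # against dep_var (grid-searched over depth and minimum sample sizes) and
+ # reuses the fitted tree's split thresholds as bin edges, so the bins are
+ # chosen to separate the target rather than to be equal-width.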
+
+
+ def decile_bin_data(df, col, no_of_bins):
+     decile_binned_data = pd.qcut(df[col], no_of_bins, duplicates='drop')
+     return decile_binned_data
+
+
+ def value_bin_data(df, col, no_of_bins):
+     value_binned_data = pd.cut(df[col], no_of_bins, duplicates='drop')
+     return value_binned_data
+
+
+ def col_bin_summary_numerical(bin_df, col, dep_var=None):
+     unique_bin_edges = bin_df[col].unique()
+     df_new = pd.DataFrame({"bin_ranges": unique_bin_edges})
+
+     try:
+         df_new = df_new.merge((bin_df[col].value_counts() / len(bin_df) * 100).reset_index().rename(columns={'index': 'bin_ranges', col: 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
+     except Exception:
+         # newer pandas names the value_counts() reset_index columns differently
+         df_new = df_new.merge((bin_df[col].value_counts() / len(bin_df) * 100).reset_index().rename(columns={col: 'bin_ranges', 'count': 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
+     if dep_var is not None:
+         df_new = df_new.merge(bin_df.groupby(col)[dep_var].sum().reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Event'}), on='bin_ranges', how='left')
+         df_new = df_new.merge(bin_df.groupby(col)[dep_var].mean().reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'}), on='bin_ranges', how='left')
+         df_new['Index'] = (100 * df_new['Mean_DV'] / bin_df[dep_var].mean()).round()  # was hardcoded to bin_df['Y']
+         df_new = df_new[['bin_ranges', 'count%', 'Event', 'Mean_DV', 'Index']]
+         df_new = df_new.sort_values(by='bin_ranges')
+
+     return df_new
+
+
+ def plot_chart(df, col, dep_var):
+     # fig = go.Figure()
+     df['bin_ranges_str'] = df['bin_ranges'].astype(str)
+     fig = make_subplots(specs=[[{"secondary_y": True}]])
+
+     # Bar trace for Count%
+     fig.add_trace(
+         go.Bar(
+             x=df['bin_ranges_str'],
+             y=df['count%'],
+             name='Count%',
+             marker_color='#053057',
+             hovertemplate=(
+                 f"Bin: %{{x}}<br>"
+                 f"Count%: %{{y}}"
+             ),
+         )
+     )
+
+     # Add the line trace for Index on the secondary y-axis
+     fig.add_trace(
+         go.Scatter(
+             x=df['bin_ranges_str'],
+             y=df['Index'],
+             mode='lines+markers',
+             name='Index',
+             marker=dict(color="#8ac4f8"),
+             hovertemplate=(
+                 f"Bin: %{{x}}<br>"
+                 f"Index%: %{{y}}"
+             ),
+         ),
+         secondary_y=True
+     )
+
+     # Update layout
+     fig.update_layout(
+         title=f'Distribution of {col}',
+         xaxis=dict(title='Bin_ranges'),
+         yaxis=dict(title='Count%', color='#053057'),
+         yaxis2=dict(title='Index', color="#8ac4f8", overlaying='y', side='right'),
+         legend=dict(x=1.02, y=0.98),
+         hovermode='x'
+     )
+
+     fig.update_xaxes(showgrid=False)
+     fig.update_yaxes(showgrid=False)
+
+     return fig
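+
+ # plot_chart draws count% per bin as bars on the left axis and the Index line
+ # (100 = the overall average response rate) on a secondary right axis.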
+
+ # def plot_chart(df, col, dep_var=None):
+ #     fig, ax1 = plt.subplots(figsize=(10, 6))
+
+ #     # Convert Interval type to string
+ #     df['bin_ranges_str'] = df['bin_ranges'].astype(str)
+
+ #     ax1.bar(df['bin_ranges_str'], df['count%'], color='b', alpha=0.7, label='Count%')
+ #     ax1.set_xlabel('Bin Ranges')
+ #     ax1.set_ylabel('Count%', color='b')
+
+ #     if dep_var is not None:
+ #         ax2 = ax1.twinx()
+ #         ax2.plot(df['bin_ranges_str'], df['Index'], color='r', marker='o', label='Index')
+ #         ax2.set_ylabel('Index', color='r')
+
+ #     ax1.set_title(f'Distribution of {col}')
+ #     ax1.legend(loc='upper left')
+
+ #     return st.plotly_chart(fig)
+
+
+ def create_numerical_binned_data(df, col, func, no_of_bins=None, dep_var=None, depth=None):
+     df_org = df.copy()
+
+     if dep_var is not None:
+         df_org[dep_var] = df_org[dep_var].astype('int64')
+         df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)
+
+         if func == 'tree':
+             bin_df = tree_based_bin_data(df, col, dep_var, depth)
+         elif func == 'decile':
+             bin_df = decile_bin_data(df_num, col, 10)
+         else:
+             bin_df = value_bin_data(df_num, col, no_of_bins)
+
+         bin_df = pd.concat([bin_df, df_org[dep_var]], axis=1)
+     else:
+         df_num = df_org.select_dtypes(include=[np.number])
+
+         if func == 'decile':
+             bin_df = decile_bin_data(df_num, col, no_of_bins)
+         else:
+             bin_df = value_bin_data(df_num, col, no_of_bins)
+
+     df_summary = col_bin_summary_numerical(bin_df, col, dep_var)
+
+     return df_summary
+
+
+ def create_numerical_binned_data1(df, col, func, no_of_bins, dep_var, depth=None):
+     df_org = df.copy()
+
+     df_org[dep_var] = df_org[dep_var].astype('int64')
+     df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)
+
+     if func == 'tree':
+         bin_df = tree_based_bin_data(df, col, dep_var, depth)
+     elif func == 'decile':
+         bin_df = decile_bin_data(df_num, col, no_of_bins)
+     else:
+         bin_df = value_bin_data(df_num, col, no_of_bins)
+
+     bin_df = pd.concat([bin_df, df_org[dep_var]], axis=1)
+
+     binned_data = pd.DataFrame()
+     binned_data[col] = df_org[col]
+     unique_bins = bin_df[col].unique()
+     for bin_value in unique_bins:
+         bin_column_name = f"{col}_{bin_value}"
+         binned_data[bin_column_name] = np.where(binned_data[col] == bin_value, df_org[col], 0)
+
+     return binned_data
+
+
+ # Categorical cols binning
+
+ def woe_iv(df, column_name, dep_var, no_of_bins):
+     y0 = df[dep_var].value_counts()[0]
+     y1 = df[dep_var].value_counts()[1]
+     if df[column_name].nunique() < 10:
+         data = pd.Series(pd.factorize(df[column_name])[0] + 1, index=df.index).rename('{}'.format(column_name)).apply(lambda x: f'bin{x}')
+     else:
+         df_woe_iv = (pd.crosstab(df[column_name], df[dep_var], normalize='columns').assign(woe=lambda dfx: np.log((dfx[1] + (0.5 / y1)) / (dfx[0] + (0.5 / y0)))).assign(iv=lambda dfx: (dfx['woe'] * (dfx[1] - dfx[0]))))
+         woe_map = df_woe_iv['woe'].to_dict()
+         woe_col = df[column_name].map(woe_map)
+         data = pd.qcut(woe_col, no_of_bins, duplicates='drop')
+         n = data.nunique()
+         labels = [f'bin{i}' for i in range(1, n + 1)]
+         data = data.cat.rename_categories(labels)
+         sizes = data.value_counts(normalize=True)
+         min_size = 0.05
+         while sizes.min() < min_size and no_of_bins > 1:
+             no_of_bins -= 1
+             data = pd.qcut(woe_col, q=no_of_bins, duplicates='drop')
+             if data.nunique() != data.cat.categories.nunique():
+                 continue
+             n = data.nunique()
+             labels = [f'bin{i}' for i in range(1, n + 1)]
+             data = data.cat.rename_categories(labels)
+             sizes = data.value_counts(normalize=True)
+     return data
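+
+ # Weight-of-evidence used above, with 0.5-count smoothing to avoid log(0):
+ #   WOE(v) = ln( P(value=v | Y=1) / P(value=v | Y=0) )
+ #   IV     = sum over values of WOE * (P(v | Y=1) - P(v | Y=0))
+ # Categories are mapped to their WOE and then quantile-binned, shrinking the
+ # bin count until every bin holds at least ~5% of rows.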
+
+ def naive_cat_bin(df, col, max_thre=10, min_thre=5, tolerence=2, flag='ignore'):
+     value_counts = df[col].value_counts()
+     total_values = len(df)
+     count_percentages = (value_counts / total_values) * 100
+     unique_values_df = pd.DataFrame({'Category': value_counts.index, 'Count Percentage': count_percentages})
+     count_per = list(unique_values_df['Count Percentage'])
+
+     final_ini = []
+     for i in count_per:
+         if i >= min_thre:
+             final_ini.append(i)
+     a = [x for x in count_per if x not in final_ini]
+
+     total_bins = int(100 / max_thre)
+     ava_bins = len(final_ini)
+     ava_bin_per = sum(final_ini)
+     bin_req = total_bins - ava_bins
+     bin_req_per = 100 - ava_bin_per
+
+     if flag == 'error' and bin_req > 0 and (bin_req_per / bin_req) > max_thre:
+         print(f"Binning for {col} is not possible with given parameters.")
+         return
+
+     step = False
+     while not step:
+         if bin_req > 0:
+             if (bin_req_per / bin_req) > min_thre:
+                 step = True
+             else:
+                 bin_req -= 1
+         else:
+             step = True
+
+     final_ini = [[x] for x in final_ini]
+
+     if bin_req > 0:
+         target_sum = bin_req_per / bin_req
+     else:
+         target_sum = bin_req_per
+         tolerence = 0
+
+     final = []
+     current_sum = 0.0
+     start_index = len(a) - 1
+     values = []
+     while start_index >= 0:
+         current_sum += a[start_index]
+         values.append(a[start_index])
+         if current_sum < target_sum - tolerence:
+             start_index -= 1
+         else:
+             final.append(values)
+             values = []
+             start_index -= 1
+             current_sum = 0.0
+     final.append(values)
+     final = final[::-1]
+     final = [sublist for sublist in final if sublist]
+     final_b = final_ini + final
+
+     final = [final_b[0]]
+     for subarr in final_b[1:]:
+         if sum(subarr) < (min_thre - tolerence):
+             final[-1].extend(subarr)
+         else:
+             final.append(subarr)
+
+     table = dict(zip(unique_values_df['Category'], unique_values_df['Count Percentage']))
+     new_final = [sublist.copy() for sublist in final]
+
+     table_reverse = defaultdict(list)
+     for k, v in table.items():
+         table_reverse[v].append(k)
+
+     output = []
+     for l in new_final:
+         temp = []
+         for item in l:
+             temp.append(table_reverse[item].pop())
+         output.append(temp)
+     new_final = output
+
+     k = len(new_final)
+     bin_labels = [f'bin{i}' for i in range(1, k + 1)]
+     bin_mapping = {value: bin_labels[i] for i, sublist in enumerate(new_final) for value in sublist}
+     bin_mapping[np.nan] = 'binNA'
+     return df[col].apply(lambda x: bin_mapping.get(x, x))
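+
+ # naive_cat_bin keeps every category whose share is at least min_thre% as its
+ # own bin, then greedily groups the remaining rare categories (rarest first)
+ # into bins of roughly equal total share, within the given tolerence; missing
+ # values map to 'binNA'.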
+
+
+ def col_bin_summary_categorical(df_cat, col, binned_df_1, dep_var=None):
+     unique_values_in_bins = df_cat.groupby(binned_df_1[col])[col].unique().apply(list)
+     unique_values_in_bins = unique_values_in_bins.rename_axis('bin').reset_index()
+     unique_bin_ranges = pd.Categorical(binned_df_1[col].unique())
+     uni = binned_df_1[col].nunique()
+     numeric_parts = [uni if val == 'binNA' else int(re.findall(r'\d+', val)[0]) for val in unique_bin_ranges]
+     unique_bin_ranges = unique_bin_ranges[np.argsort(numeric_parts)]
+     df_new_cat = pd.DataFrame({"column_name": [col] * len(unique_bin_ranges), "bin_ranges": unique_bin_ranges})
+     df_new_cat = df_new_cat.merge(unique_values_in_bins.rename(columns={'bin': 'bin_ranges', col: 'values in bin'}))
+     df_new_cat = df_new_cat.merge((binned_df_1[col].value_counts() / len(binned_df_1) * 100).reset_index().rename(columns={col: 'bin_ranges', 'count': 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
+     if dep_var is not None:
+         df_new_cat = df_new_cat.merge(binned_df_1.groupby(col)[dep_var].sum(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Event'}), on='bin_ranges')
+         df_new_cat = df_new_cat.merge(binned_df_1.groupby(col)[dep_var].mean(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'}), on='bin_ranges')
+         df_new_cat['Index'] = (100 * df_new_cat['Mean_DV'] / binned_df_1[dep_var].mean()).round()
+     return df_new_cat
+
+ def create_categorical_binned_data(imputed_df, col, categorical_binning, dep_var, no_of_bins=None, max_thre=None, min_thre=None, tolerence=2, flag='ignore'):
+
+     imputed_df[dep_var] = imputed_df[dep_var].astype('int64')
+     df_cat = imputed_df.select_dtypes(include=['object'])
+     # remove columns with only one unique value
+     unique_counts = df_cat.nunique()
+     unique_cols = unique_counts[unique_counts == 1].index.tolist()
+     df_cat = df_cat.drop(unique_cols, axis=1)
+
+     if categorical_binning == 'woe_iv':
+         df_nominal = pd.concat([imputed_df[col], imputed_df[dep_var]], axis=1)
+         tqdm.pandas(dynamic_ncols=True, position=0)
+         binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, dep_var, no_of_bins))
+         binned_df_nominal.drop(dep_var, axis=1, inplace=True)
+         binned_df_nominal = binned_df_nominal.applymap(lambda x: 'NA' if pd.isnull(x) else x)
+         binned_df_nominal = binned_df_nominal.astype('category')
+
+         cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
+         binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
+
+         binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dep_var]], axis=1)
+     elif categorical_binning == 'naive':
+         df_nominal = pd.concat([imputed_df[col], imputed_df[dep_var]], axis=1)
+         tqdm.pandas(dynamic_ncols=True, position=0)
+         binned_df_nominal = df_nominal.progress_apply(lambda x: naive_cat_bin(df_nominal, x.name, 20, 5, 2, flag='ignore'))
+         binned_df_nominal.drop(dep_var, axis=1, inplace=True)
+         binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')
+         binned_df_nominal = binned_df_nominal.astype('category')
+
+         cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
+         binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
+
+         binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dep_var]], axis=1)
+
+     df_summary = col_bin_summary_categorical(df_cat, col, binned_df_nominal_1, dep_var)
+     return df_summary
+
+ def create_categorical_binned_data1(imputed_df, col, nominal_binning, dependant_target_variable, no_of_bins=10, max_thre=10, min_thre=5, tolerence=2, flag='ignore', min_cluster_size=0.05, max_clusters=10):
+
+     imputed_df[dependant_target_variable] = imputed_df[dependant_target_variable].astype('int64')
+     df_cat = imputed_df.select_dtypes(include=['object'])
+     # remove columns with only one unique value
+     unique_counts = df_cat.nunique()
+     unique_cols = unique_counts[unique_counts == 1].index.tolist()
+     df_cat = df_cat.drop(unique_cols, axis=1)
+
+     if nominal_binning == 'woe_iv':  # was 'woe', which never matched the UI options
+         df_nominal = pd.concat([imputed_df[col], imputed_df[dependant_target_variable]], axis=1)
+         tqdm.pandas(dynamic_ncols=True, position=0)
+         binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, dependant_target_variable, no_of_bins))
+         binned_df_nominal.drop(dependant_target_variable, axis=1, inplace=True)
+         binned_df_nominal = binned_df_nominal.applymap(lambda x: 'NA' if pd.isnull(x) else x)
+         binned_df_nominal = binned_df_nominal.astype('category')
+
+         cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
+         binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
+
+         binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dependant_target_variable]], axis=1)
+     elif nominal_binning == 'naive':
+         df_nominal = pd.concat([imputed_df[col], imputed_df[dependant_target_variable]], axis=1)
+         tqdm.pandas(dynamic_ncols=True, position=0)
+         binned_df_nominal = df_nominal.progress_apply(lambda x: naive_cat_bin(df_nominal, x.name, 20, 5, 2, flag='ignore'))
+         binned_df_nominal.drop(dependant_target_variable, axis=1, inplace=True)
+         binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')
+         binned_df_nominal = binned_df_nominal.astype('category')
+
+         cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
+         binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
+
+         binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dependant_target_variable]], axis=1)
+
+     df_summary = col_bin_summary_categorical(df_cat, col, binned_df_nominal_1, dependant_target_variable)
+
+     binned_data = pd.DataFrame()
+     for bin_value in df_summary['values in bin']:
+         bin_column_name = f"{col}_{bin_value}"
+         binned_data[bin_column_name] = np.where(df_cat[col].isin(bin_value), 1, 0)
+
+     return binned_data
+
+
+
+ numerical_columns = st.session_state.imputed_df.select_dtypes(include=['number']).columns.tolist()
+ numerical_columns = [x for x in numerical_columns if x != st.session_state.flag]
+ categorical_columns = st.session_state.imputed_df.select_dtypes(include=['object', 'category']).columns.tolist()
+ categorical_columns = [x for x in categorical_columns if x != st.session_state.identifier]
+ st.session_state.numerical_columns = numerical_columns
+ st.session_state.categorical_columns = categorical_columns
+
+
+ st.title("Variable Profiling")
+
+ # Retrieve stored options from session_state or use default values
+ function_num = st.session_state.get("function_num", "value")
+ depth = st.session_state.get("depth", 3)
+ num_bins = st.session_state.get("num_bins", 10)
+ function_cat = st.session_state.get("function_cat", "woe_iv")
+ max_slider = st.session_state.get("max_slider", 10)
+ min_slider = st.session_state.get("min_slider", 5)
+ cat_bins_iv = st.session_state.get("cat_bins_iv", 10)
+ cat_bins_naive = st.session_state.get("cat_bins_naive", 10)
+
+ with st.expander("Profiling Inputs"):
+     st.write("Binning Inputs")
+     ui_columns = st.columns((1, 1))
+     with ui_columns[0]:
+         function_num = st.selectbox(
+             label="Select Numerical Binning Function",
+             options=['value', 'tree'],
+             # index=None
+             index=['value', 'tree'].index(st.session_state.function_num) if 'function_num' in st.session_state and st.session_state.function_num is not None else None
+         )
+         st.session_state.function_num = function_num  # Store selected option
+         params_num = st.empty()
+
+     with params_num:
+         with ui_columns[-1]:
+             if function_num == 'tree':
+                 depth = st.slider(
+                     label="Depth",
+                     min_value=1,
+                     max_value=10,
+                     value=depth,
+                     key='depth_slider')
+                 st.session_state.depth = depth  # Store selected depth
+             elif function_num == 'value':
+                 num_bins = st.slider(
+                     label="Number of Bins",
+                     min_value=2,
+                     max_value=20,
+                     value=num_bins,
+                     key='num_bins_slider_num')
+                 st.session_state.num_bins = num_bins  # Store selected number of bins
+     left, right = st.columns(2)
+
+     with left:
+         function_cat = st.selectbox(
+             label="Select Categorical Binning Function",
+             options=['woe_iv', 'naive'],
+             # index=None
+             index=['woe_iv', 'naive'].index(st.session_state.function_cat) if 'function_cat' in st.session_state and st.session_state.function_cat is not None else None
+         )
+         st.session_state.function_cat = function_cat  # Store selected option
+         params_cat = st.empty()
+
+     with params_cat:
+
+         if function_cat == 'woe_iv':
+             with right:
+                 cat_bins_iv = st.slider(
+                     label="Number of Bins",
+                     min_value=2,
+                     max_value=20,
+                     value=cat_bins_iv,
+                     key='num_bins_slider_cat_iv')
+                 st.session_state.cat_bins_iv = cat_bins_iv  # Store selected number of bins
+             with left:
+                 min_slider = st.slider(
+                     label="Min Threshold",
+                     min_value=1,
+                     max_value=100,
+                     value=min_slider,
+                     key='min_slider')
+                 st.session_state.min_slider = min_slider  # Store selected min threshold
+             with right:
+                 max_slider = st.slider(
+                     label="Max Threshold",
+                     min_value=1,
+                     max_value=100,
+                     value=max_slider,
+                     key='max_slider')
+                 st.session_state.max_slider = max_slider  # Store selected max threshold
+         elif function_cat == 'naive':
+             with right:
+                 cat_bins_naive = st.slider(
+                     label="Number of Bins",
+                     min_value=2,
+                     max_value=20,
+                     value=cat_bins_naive,
+                     key='num_bins_slider_cat_naive')
+                 st.session_state.cat_bins_naive = cat_bins_naive  # Store selected number of bins
+
+     with left:
+         st.write("#")
+         perform_profiling = st.button(
+             label="Perform profiling"
+         )
+
+
+ # if perform_profiling:
+ #     binned_data_num = pd.DataFrame()
+ #     for col in st.session_state.numerical_columns:
+ #         if function_num != 'tree':
+ #             depth = None
+ #         if function_num != 'value':
+ #             num_bins = None
+ #         binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
+ #         binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
+ #         binned_data_num = pd.concat([binned_data_num, binned_data_col], axis=0)
+ #     st.markdown("binned_data_num")
+ #     st.dataframe(binned_data_num, use_container_width=True, hide_index=True)
+
+ if perform_profiling:
+     with st.expander("Profiling summary"):
+         st.write("Numerical binned data")
+         binned_data_num = pd.DataFrame()
+         for col in st.session_state.numerical_columns:
+             if function_num != 'tree':
+                 depth = None
+             if function_num != 'value':
+                 num_bins = None
+             binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
+             binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
+             binned_data_num = pd.concat([binned_data_num, binned_data_col], axis=0)
+         st.dataframe(binned_data_num, use_container_width=True, hide_index=True)
+
+         st.write("Categorical binned data")
+         binned_data_cat = pd.DataFrame()
+         for col in st.session_state.categorical_columns:
+             # pick the thresholds/bin count for the chosen binning function
+             # (the original second if/else reset no_of_bins to None for 'woe_iv')
+             if function_cat == 'woe_iv':
+                 max_thre = max_slider
+                 min_thre = min_slider
+                 no_of_bins = cat_bins_iv
+             elif function_cat == 'naive':
+                 max_thre = None
+                 min_thre = None
+                 no_of_bins = cat_bins_naive
+             binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
+             binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
+             binned_data_col_cat.drop('column_name', axis=1, inplace=True)
+             binned_data_cat = pd.concat([binned_data_cat, binned_data_col_cat], axis=0)
+         st.dataframe(binned_data_cat, use_container_width=True, hide_index=True)
+
+
+     with st.expander("Profiling summary: Plots"):
+         st.markdown(
+             "<p class='plot-header'>Change the selected variable to plot"
+             " different charts</p>",
+             unsafe_allow_html=True,
+         )
+         left, right = st.columns(2)
+         with left:
+             if 'selected_variable' not in st.session_state:
+                 st.session_state.selected_variable = []  # Initialize selected_variable
+
+             selected_variable = st.selectbox(
+                 "Variable",
+                 st.session_state.numerical_columns + st.session_state.categorical_columns,
+                 # index=None
+             )
+             if isinstance(selected_variable, str):
+                 selected_variable = [selected_variable]  # Convert single selection to list
+
+             # Update session state with selected variable
+             st.session_state.selected_variable = selected_variable
+
+
+         # Iterate over selected variable(s)
+         if st.session_state.selected_variable:
+             for col in st.session_state.selected_variable:
+                 if col in st.session_state.numerical_columns:
+                     if function_num != 'tree':
+                         depth = None
+                     if function_num != 'value':
+                         num_bins = None
+                     binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
+                     binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
+                     fig = plot_chart(binned_data_col, col, dep_var=None)
+                     st.plotly_chart(fig, use_container_width=True)
+
+                 elif col in st.session_state.categorical_columns:
+                     if function_cat == 'woe_iv':
+                         max_thre = max_slider
+                         min_thre = min_slider
+                         no_of_bins = cat_bins_iv
+                     elif function_cat == 'naive':
+                         max_thre = None
+                         min_thre = None
+                         no_of_bins = cat_bins_naive
+                     binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
+                     binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
+                     binned_data_col_cat.drop('column_name', axis=1, inplace=True)
+                     fig_cat = plot_chart(binned_data_col_cat, col, dep_var=None)
+                     st.plotly_chart(fig_cat, use_container_width=True)
+
+
+     st.divider()
+     # Combine numerical and categorical binned data into one dataframe
+     binned_data_combined = pd.DataFrame()
+
+     # Process numerical columns
+     for col in st.session_state.numerical_columns:
+         if function_num != 'tree':
+             depth = None
+         if function_num != 'value':
+             num_bins = None
+         binned_data_num = create_numerical_binned_data1(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
+         binned_data_combined = pd.concat([binned_data_combined, binned_data_num], axis=1)
+
+     # Process categorical columns
+     for col in st.session_state.categorical_columns:
+         if function_cat == 'woe_iv':
+             max_thre = max_slider
+             min_thre = min_slider
+             no_of_bins = cat_bins_iv
+         elif function_cat == 'naive':
+             max_thre = None
+             min_thre = None
+             no_of_bins = cat_bins_naive
+         binned_data_cat = create_categorical_binned_data1(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
+         binned_data_combined = pd.concat([binned_data_combined, binned_data_cat], axis=1)
+
+     def clean_column_name(column_name):
+         # strip the decimal part of bin-edge numbers embedded in column names
+         return re.sub(r'\.(\d+)', '', column_name)
+     binned_data_combined.columns = binned_data_combined.columns.map(clean_column_name)
+     valid_feature_names = [name.replace('[', '').replace(']', '').replace('<', '').replace(',', '_').replace('(', '').replace("'", '') for name in binned_data_combined.columns]
+     valid_feature_names = [name.replace(' ', '') for name in valid_feature_names]
+     binned_data_combined.columns = valid_feature_names
+     # Display the combined binned data dataframe
+     st.session_state.binned_df = binned_data_combined
+     st.session_state.binned_df[st.session_state.flag] = st.session_state.imputed_df[st.session_state.flag]
+     st.session_state.binned_df.insert(0, st.session_state.identifier, st.session_state.imputed_df[st.session_state.identifier])
+     # print(st.session_state.binned_df['individual_id_ov'])  # debug output; assumes a specific identifier column
+     # st.session_state.binned_df[st.session_state.identifier] = st.session_state.imputed_df[st.session_state.identifier]
+     st.markdown("Binned DataFrame")
+     st.dataframe(binned_data_combined.head(10), use_container_width=True, hide_index=True)
+
+     # Add a button to download the binned dataframe
+     if st.session_state.binned_df is not None:
+         # with st.expander("Download Binned Data"):
+         download_button = st.download_button(
+             label="Download Binned Data as CSV",
+             data=st.session_state.binned_df.to_csv(index=False).encode(),
+             file_name='binned_data.csv',
+             mime='text/csv',
+         )
+
+
+ # Create a button to download the DataFrame as CSV
+ # if st.button("Download Binned Data"):
+ #     binned_csv = binned_df.to_csv(index=False)
+ #     b64 = base64.b64encode(binned_csv.encode()).decode()
+ #     href = f'<a href="data:file/csv;base64,{b64}" download="binned_data.csv">Download Binned Data CSV File</a>'
+ #     st.markdown(href, unsafe_allow_html=True)
+
+
+ # def download_button(data, file_name, button_text):
+ #     csv = data.to_csv(index=False).encode()
+ #     href = f'<a href="data:file/csv;base64,{csv.decode()}" download="{file_name}">{button_text}</a>'
+ #     st.markdown(href, unsafe_allow_html=True)
+
+ # # Add the download button
+ # download_button(binned_data_combined, 'data.csv', 'Download CSV')
+
+
+ # with st.expander("Profiling summary: Plots"):
+ #     st.markdown(
+ #         "<p class='plot-header'>Change the selected variable to plot"
+ #         " different charts</p>",
+ #         unsafe_allow_html=True,
+ #     )
+ #     st.write("Numerical binned data plots")
+ #     for col in st.session_state.numerical_columns:
+ #         if function_num != 'tree':
+ #             depth = None
+ #         if function_num != 'value':
+ #             num_bins = None
+ #         binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
+ #         binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
+ #         fig = plot_chart(binned_data_col, col, dep_var=None)
+ #         st.plotly_chart(fig, use_container_width=False)
+
+ #     st.write("Categorical binned data plots")
+ #     for col in st.session_state.categorical_columns:
+ #         if function_cat == 'woe_iv':
+ #             max_thre = max_slider
+ #             min_thre = min_slider
+ #             no_of_bins = cat_bins_iv
+ #         elif function_cat == 'naive':
+ #             max_thre = None
+ #             min_thre = None
+ #             no_of_bins = cat_bins_naive
+ #         binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
+ #         binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
+ #         binned_data_col_cat.drop('column_name', axis=1, inplace=True)
+ #         fig_cat = plot_chart(binned_data_col_cat, col, dep_var=None)
+ #         st.plotly_chart(fig_cat, use_container_width=False)
pages/pages/3_Point estimates.py ADDED
@@ -0,0 +1,369 @@
1
+ ###### SUPER SAFE ######
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ import streamlit as st
8
+ import seaborn as sn
9
+ import matplotlib.pyplot as plt
10
+ from sklearn.linear_model import LogisticRegression
11
+ from sklearn.preprocessing import MinMaxScaler, StandardScaler
12
+ from sklearn.metrics import confusion_matrix, classification_report
13
+ from sklearn.model_selection import train_test_split
14
+ import xgboost as xgb
15
+ from sklearn.linear_model import LinearRegression
16
+ from sklearn.metrics import mean_squared_error, r2_score
17
+ from sklearn.decomposition import PCA
20
+ import plotly.figure_factory as ff
21
+
22
+
23
+ st.set_page_config(
24
+ layout="wide",
25
+ )
26
+
27
+ def point_estimates(df, model_type, flag, identifier, control_sample_size, solver=None, max_iter=None, class_weights=None, max_depth=None, subsample=None, eta=None):
28
+ # if set(df[df[flag] == 0][identifier]).intersection(set(df[df[flag] == 1][identifier])):
29
+ # st.error("The identifier should not be common between flag values 0 and 1.")
30
+
31
+ Xs = df.drop(columns=[c for c in [identifier, flag, 'propensity_score'] if c in df.columns])  # also drop any score column left over from a previous run
32
+ X_scaled = StandardScaler().fit_transform(Xs)
33
+ n_comp = len(Xs.columns)
34
+ pca = PCA(n_components=n_comp)
35
+ pca.fit(X_scaled)
36
+ princ_comp = pca.transform(X_scaled)
37
+ PCA_DF = pd.DataFrame(princ_comp)
38
+ pca_var = pca.explained_variance_ratio_[0:n_comp].cumsum()
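+ # keep the smallest number of leading components whose cumulative explained variance exceeds 99.5%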
39
+ idx = [i for i in range(len(pca_var)) if pca_var[i] > 0.995][0]
40
+ df_pca = PCA_DF.loc[:, 0:idx]
41
+ df_pca[flag]=df[flag]
42
+ print(df_pca)
43
+ #creating train and control datasets
44
+ df_train = df_pca[df_pca[flag] == 1]
45
+ df_control = df_pca[df_pca[flag] == 0]
46
+ df_control_sample = df_control.sample(n=control_sample_size, random_state=42)
47
+ final_df_sample = pd.concat([df_train, df_control_sample], ignore_index=True)
48
+ non_req_cols=[flag]
49
+ req_cols=df_pca.columns[~df_pca.columns.isin(non_req_cols)]
50
+ # create a holdout set
51
+ identifier_df, X, y = df[[identifier]], final_df_sample[req_cols], final_df_sample[[flag]]
52
+ if model_type == 'linear':
53
+ # scale features
54
+ # min_max_scaler = MinMaxScaler()
55
+ # X_norm = min_max_scaler.fit_transform(X)
56
+ #X_norm = (X - X.min()) / (X.max() - X.min())
57
+ # fit model
58
+ model = LogisticRegression(solver=solver, max_iter=max_iter, class_weight=class_weights)
59
+ model.fit(X, y.values.ravel())
60
+ #feature importances
61
+ coefs = model.coef_[0]
62
+ feats = X.columns
63
+ importance_df = pd.DataFrame({'features':feats, 'coefficients':coefs})
64
+ importance_df['abs_coef'] = np.abs(importance_df['coefficients'])
65
+ elif model_type == 'xgboost':
66
+ model = xgb.XGBClassifier(max_depth=max_depth, subsample=subsample, eta=eta)
67
+ model.fit(X, y)
68
+ importance = model.feature_importances_
69
+ feats = X.columns
70
+ importance_df = pd.DataFrame({'features':feats, 'Importance':importance})
71
+
72
+ #Prediction
73
+ Y_pred = model.predict(X)
74
+ #Confusion matrix
75
+ #cm = confusion_matrix(y, Y_pred)/y.shape[0]
76
+ cm = confusion_matrix(y, Y_pred) / len(y)
77
+
78
+ # Create DataFrame for confusion matrix
79
+ classes = np.unique(y)
80
+ df_cm = pd.DataFrame(cm, index=classes, columns=classes)
81
+
82
+ # Create hover text
83
+ hover_text = [['Actual: {}<br>Predicted: {}<br>Value: {:.2f}'.format(classes[i], classes[j], cm[i, j])
84
+ for j in range(len(classes))] for i in range(len(classes))]
85
+
86
+ # Create heatmap using Plotly with hover text
87
+ fig = ff.create_annotated_heatmap(z=df_cm.values,
88
+ x=list(classes),
89
+ y=list(classes),
90
+ colorscale='blues',
91
+ hoverinfo='text',
92
+ text=hover_text)
93
+
94
+ # Update heatmap layout
95
+ fig.update_layout(
96
+ title='Confusion Matrix',
97
+ xaxis_title='Predicted',
98
+ yaxis_title='Actual',
99
+ font=dict(size=14)
100
+ )
101
+
102
+ # Display Plotly figure in Streamlit
103
+ #st.plotly_chart(fig)
104
+ #classification report
105
+ report = classification_report(y, Y_pred, output_dict=True)
106
+ # Convert the classification report to a DataFrame
107
+ report_df = pd.DataFrame(report).transpose()
108
+ # prep data
109
+ X, y = df_pca[req_cols], df_pca[[flag]]
110
+ #X, y = df.drop(columns=[flag,identifier]), df[[flag]]
111
+ # scale features
112
+ # min_max_scaler = MinMaxScaler()
113
+ # X_norm = min_max_scaler.fit_transform(X)
114
+ #X_norm = (X - X.min()) / (X.max() - X.min())
115
+ # run inference
116
+ y_pred_proba = model.predict_proba(X)
117
+ y_pred_df = pd.DataFrame(y_pred_proba)
118
+ df_pca.insert(0, 'propensity_score', y_pred_df[1])
119
+ # df_pca[identifier] = identifier_df
120
+ # df_pca[identifier]=df_pca[identifier].astype('str')
121
+ # Display classification report
122
+ st.subheader("Classification Report")
123
+ st.dataframe(report_df,width=600)
124
+
125
+ # Display confusion matrix
126
+ # st.subheader("Confusion Matrix")
127
+ # st.write(df_cm,width=600)
128
+
129
+ # Display confusion matrix
130
+ st.subheader("Confusion matrix")
131
+ st.plotly_chart(fig)
132
+ return df_pca[['propensity_score']]
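+ # Minimal usage sketch (hypothetical names; assumes a numeric feature frame
+ # plus an identifier column and a 0/1 treatment flag):
+ #   scores = point_estimates(binned_df, model_type='linear', flag='Y',
+ #                            identifier='id', control_sample_size=1000,
+ #                            solver='lbfgs', max_iter=1000)
+ #   binned_df['propensity_score'] = scores['propensity_score']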
133
+
134
+
135
+
136
+ # if 'df' in st.session_state:
137
+ # task_type = st.sidebar.selectbox("Task Type", ["classification", "regression"],key="task_type")
138
+ # model_type = st.sidebar.selectbox("Model Type", ["linear", "xgboost"])
139
+ # flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
140
+ # identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))
141
+ # st.sidebar.write("Applicable only for Regression model type")
142
+ # dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
143
+ # st.session_state.flag=flag
144
+ # st.session_state.identifier=identifier
145
+ # # Sidebar for user inputs
146
+ # if flag is not None:
147
+ # with st.expander("Model Configuration", expanded=True):
148
+ # unique_flag_values = st.session_state.df[flag].unique()
149
+ # for value in unique_flag_values:
150
+ # st.write(f"Y == {value}: {len(st.session_state.df[st.session_state.df[flag] == value])}")
151
+ # control_sample_size = st.text_input("Control Sample Size")
152
+
153
+ # try:
154
+ # # Try converting to an integer
155
+ # control_sample_size = int(control_sample_size)
156
+
157
+ # # Check if control_sample_size is within the valid range
158
+ # flag_0_size = len(st.session_state.df[st.session_state.df[flag] == 0])
159
+ # if control_sample_size < 0 or control_sample_size > flag_0_size:
160
+ # st.error(f"Control Sample Size must be between 0 and {flag_0_size}.")
161
+
162
+ # except ValueError:
163
+ # st.error("Please enter a valid integer for Control Sample Size.")
164
+
165
+
166
+ # #st.write("Applicable only for Regression model type")
167
+ # #if st.session_state.get("task_type","") == "regression":
168
+ # #dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
169
+ # point_estimate_variable = st.text_input("Variable of interest")
170
+ # st.session_state.point_estimate_variable=point_estimate_variable
171
+
172
+ # if st.button("Run Modeling"):
173
+ # result_df = point_estimates(st.session_state.df, task_type, model_type, point_estimate_variable, control_sample_size, flag, identifier, dep_var)
174
+
175
+ # st.session_state.modeling_df = result_df
176
+ # st.session_state.treated_df=result_df[result_df['Y']==1]
177
+ # st.session_state.non_treated_df=result_df[result_df['Y']==0]
178
+
179
+
180
+
181
+
182
+ st.title("Algorithms")
183
+
184
+ #st.subheader("Classification") # Added line
185
+ #classification_option = st.radio("Classification", ["Classification"]) # Added line
186
+
187
+ if 'classification_option' not in st.session_state:
188
+ st.session_state.classification_option = "Classification"
189
+ if 'algorithm_option' not in st.session_state:
190
+ st.session_state.algorithm_option = "Logistic Regression"
191
+
192
+ classification_option = st.radio("Algorithm Type", ["Classification", "Regression"], key="classification_option")
193
+
194
+ # the widget key already keeps st.session_state.classification_option in sync; no manual update is needed
196
+
197
+ if st.session_state.classification_option == "Classification":
198
+ col1, col2 = st.columns(2)
199
+
200
+ with col1:
201
+ st.write("#####")
202
+ lr_checkbox = st.checkbox(
203
+ label="Logistic Regression",
204
+ key="algorithm_lr_cb",
205
+ value=(st.session_state.algorithm_option == "Logistic Regression")
206
+ )
207
+
208
+ with col2:
209
+ st.write("#####")
210
+ show_lr_options = st.checkbox(
211
+ label="Change default options",
212
+ key="lr_options_cb",
213
+ disabled=not lr_checkbox,
214
+ )
215
+
216
+ cols = st.columns((2, 1))
217
+ with cols[0]:
218
+ lr_hyp_placeholder = st.empty()
219
+ lr_model_placeholder = st.empty()
220
+
221
+ solver='lbfgs'
222
+ class_weights=None
223
+ max_iter=1000
224
+ if show_lr_options and lr_checkbox:
225
+ with lr_hyp_placeholder:
226
+ with st.expander("LR parameters"):
227
+ solver=st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
228
+ max_iter=st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
229
+ class_weight_option = st.selectbox(
230
+ 'Select class weights option:',
231
+ ('Custom', 'Balanced')
232
+ )
233
+
234
+ if class_weight_option == 'Custom':
235
+ weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
236
+ weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
237
+ class_weights = {1: weight_1, 0: weight_0}
238
+ elif class_weight_option == 'Balanced':
239
+ class_weights = 'balanced'  # let sklearn weight classes inversely to their frequency
240
+ #control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))
241
+
242
+ col1, col2 = st.columns(2)
243
+
244
+ with col1:
245
+ st.write("#####")
246
+ xgb_checkbox = st.checkbox(
247
+ label="Xgboost Classifier", key="algorithm_xgb_cb",
248
+ value=(st.session_state.algorithm_option == "Xgboost Classifier")
249
+ )
250
+
251
+ with col2:
252
+ st.write("#####")
253
+ show_xgb_options = st.checkbox(
254
+ label="Change default options",
255
+ key="xgb_options_cb",
256
+ disabled=not xgb_checkbox,
257
+ )
258
+
259
+ cols = st.columns((2, 1))
260
+ with cols[0]:
261
+ xgb_hyp_placeholder = st.empty()
262
+
263
+ max_depth=None
264
+ subsample=None
265
+ eta=None
266
+
267
+ if show_xgb_options and xgb_checkbox:
268
+ with xgb_hyp_placeholder:
269
+ with st.expander("XGB hyper parameters"):
270
+ max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
271
+ subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
272
+ eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)
273
+ #control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))
274
+ st.session_state.algorithm_option = "Logistic Regression" if lr_checkbox else "Xgboost Classifier"
275
+
276
+ elif classification_option == "Regression":
277
+ col1, col2 = st.columns(2)
278
+
279
+ with col1:
280
+ st.write("#####")
281
+ lr_checkbox = st.checkbox(
282
+ label="Linear Regression",
283
+ key="algorithm_lr_cb",
284
+ value=(st.session_state.algorithm_option == "Linear Regression")
285
+ )
286
+
287
+ with col2:
288
+ st.write("#####")
289
+ show_lr_options = st.checkbox(
290
+ label="Change default options",
291
+ key="lr_options_cb",
292
+ disabled=not lr_checkbox,
293
+ )
294
+
295
+ cols = st.columns((2, 1))
296
+ with cols[0]:
297
+ lr_hyp_placeholder = st.empty()
298
+ lr_model_placeholder = st.empty()
299
+
300
+ solver='lbfgs'
301
+ class_weights=None
302
+ max_iter=1000
303
+ if show_lr_options and lr_checkbox:
304
+ with lr_hyp_placeholder:
305
+ with st.expander("LR parameters"):
306
+ solver=st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
307
+ max_iter=st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
308
+ class_weight_option = st.selectbox(
309
+ 'Select class weights option:',
310
+ ('Custom', 'Balanced')
311
+ )
312
+
313
+ if class_weight_option == 'Custom':
314
+ weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
315
+ weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
316
+ class_weights = {1: weight_1, 0: weight_0}
317
+ elif class_weight_option == 'Balanced':
318
+ class_weights = 'balanced'  # let sklearn weight classes inversely to their frequency
319
+
320
+ col1, col2 = st.columns(2)
321
+
322
+ with col1:
323
+ st.write("#####")
324
+ xgb_checkbox = st.checkbox(
325
+ label="Xgboost Regression", key="algorithm_xgb_cb",
326
+ value=(st.session_state.algorithm_option == "Xgboost Regression")
327
+ )
328
+
329
+ with col2:
330
+ st.write("#####")
331
+ show_xgb_options = st.checkbox(
332
+ label="Change default options",
333
+ key="xgb_options_cb",
334
+ disabled=not xgb_checkbox,
335
+ )
336
+
337
+ cols = st.columns((2, 1))
338
+ with cols[0]:
339
+ xgb_hyp_placeholder = st.empty()
340
+
341
+ max_depth=None
342
+ subsample=None
343
+ eta=None
344
+
345
+ if show_xgb_options and xgb_checkbox:
346
+ with xgb_hyp_placeholder:
347
+ with st.expander("XGB hyper parameters"):
348
+ max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
349
+ subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
350
+ eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)
351
+ st.session_state.algorithm_option = "Linear Regression" if lr_checkbox else "Xgboost Regression"
352
+
353
+ with cols[0]:
354
+ control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))
355
+
356
+ #st.subheader("Classification") # Added line
357
+ #classification_option = st.radio("Classification", ["Classification"]) # Added line
358
+
359
+ if st.button("Run Modeling"):
360
+ if lr_checkbox:
361
+ st.session_state.binned_df['propensity_score'] = point_estimates(st.session_state.binned_df,model_type='linear',flag=st.session_state.flag,identifier=st.session_state.identifier,control_sample_size=control_sample_size,solver=solver,max_iter=max_iter,class_weights=class_weights)
362
+ elif xgb_checkbox:
363
+ st.session_state.binned_df['propensity_score'] = point_estimates(st.session_state.binned_df,model_type='xgboost',flag=st.session_state.flag,identifier=st.session_state.identifier,control_sample_size=control_sample_size,max_depth=max_depth, subsample=subsample, eta=eta)
364
+
365
+
366
+ # st.session_state.binned_df['propensity_score'] = result_df['propensity_score']
367
+ st.session_state.treated_df = st.session_state.binned_df[st.session_state.binned_df[st.session_state.flag] == 1]
368
+ st.session_state.non_treated_df = st.session_state.binned_df[st.session_state.binned_df[st.session_state.flag] == 0]
369
+
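Note: the classification report in point_estimates is computed on the same rows the model was fit on, so it reads optimistically. A held-out variant, sketched with generic names (assumed, not part of the committed file):

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def holdout_report(X: pd.DataFrame, y: pd.Series) -> str:
    # Fit on 75% of rows, report on the stratified 25% hold-out.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y)
    model = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
    return classification_report(y_te, model.predict(X_te))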
pages/pages/4_Matching & Diagnostics.py ADDED
@@ -0,0 +1,490 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.neighbors import NearestNeighbors
5
+ from sklearn.preprocessing import StandardScaler
6
+ import xgboost as xgb
7
+ import base64
11
+ import matplotlib.pyplot as plt
14
+ from math import sqrt
15
+ from statistics import mean, variance
16
+ import seaborn as sns
17
+
18
+ import plotly.graph_objects as go
19
+
20
+ def cohend_plot_function(std_mean_diff_df2, std_mean_diff_df, selected_attributes):
21
+ # Create subplot of selected attributes
22
+ fig = go.Figure()
23
+
24
+ x = std_mean_diff_df2[std_mean_diff_df2["Metrics"].isin(selected_attributes)]["Cohend Value"][::-1]
25
+ y = list(std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Metrics"][::-1])
26
+
27
+ x1 = std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Cohend Value"][::-1]
28
+ y1 = list(std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Metrics"][::-1])
29
+
30
+ # Add traces
31
+ fig.add_trace(go.Scatter(
32
+ x=x,
33
+ y=y,
34
+ mode='markers',
35
+ marker=dict(color='blue'),
36
+ name='general_control_cohend'
37
+ ))
38
+
39
+ fig.add_trace(go.Scatter(
40
+ x=x1,
41
+ y=y1,
42
+ mode='markers',
43
+ marker=dict(color='orange', symbol='diamond-open'),
44
+ name='synthetic_control_cohend'
45
+ ))
46
+
47
+ # Add vertical lines
48
+ for val in [-0.1, 0.1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75]:
49
+ fig.add_shape(
50
+ type="line",
51
+ x0=val,
52
+ y0=0,
53
+ x1=val,
54
+ y1=10,
55
+ line=dict(
56
+ color="gray",
57
+ width=1,
58
+ dash="dash",
59
+ )
60
+ )
61
+
62
+ # Add vertical line at x=0
63
+ fig.add_shape(
64
+ type="line",
65
+ x0=0,
66
+ y0=0,
67
+ x1=0,
68
+ y1=10,
69
+ line=dict(
70
+ color="black",
71
+ width=1,
72
+ )
73
+ )
74
+
75
+ # Update layout
76
+ fig.update_layout(
77
+ xaxis=dict(
78
+ title='cohend',
79
+ range=[-1, 1]
80
+ ),
81
+ yaxis=dict(
82
+ title='Metrics',
83
+ autorange="reversed"
84
+ ),
85
+ legend=dict(
86
+ orientation="h",
87
+ yanchor="bottom",
88
+ y=1.02,
89
+ xanchor="right",
90
+ x=1
91
+ )
92
+ )
93
+
94
+ # Show
95
+ st.plotly_chart(fig,use_container_width=True)
96
+
97
+
98
+ def plot_comparison(comparison_df):
99
+ fig = go.Figure()
100
+
101
+ # Add bars for treatment and control values
102
+ fig.add_trace(go.Bar(
103
+ x=comparison_df.index,
104
+ y=comparison_df[comparison_df.columns[0]],
105
+ name='Treatment',
106
+ marker=dict(color='#053057'),
107
+ ))
108
+
109
+ fig.add_trace(go.Bar(
110
+ x=comparison_df.index,
111
+ y=comparison_df[comparison_df.columns[1]],
112
+ name='Control',
113
+ marker=dict(color='#8ac4f8'),
114
+ ))
115
+
116
+ # Update layout
117
+ fig.update_layout(
118
+ xaxis=dict(
119
+ title='quartiles'
120
+ ),
121
+ yaxis=dict(
122
+ title='values'
123
+ ),
124
+ barmode='group',
125
+ title=comparison_df.columns[0].split('treatment')[1][1:]
126
+ )
127
+
128
+ # Show
129
+ st.plotly_chart(fig,use_container_width=True)
130
+
131
+
132
+ def plot_propensity_distribution(treatment_data, control_data):
133
+ fig = go.Figure()
134
+
135
+ # Add histograms for treatment and control data
136
+ fig.add_trace(go.Histogram(
137
+ x=treatment_data,
138
+ name='Treatment',
139
+ marker=dict(color='#053057'),
140
+ opacity=0.6
141
+ ))
142
+
143
+ fig.add_trace(go.Histogram(
144
+ x=control_data,
145
+ name='Control',
146
+ marker=dict(color='#8ac4f8'),
147
+ opacity=0.6
148
+ ))
149
+
150
+ # Update layout
151
+ fig.update_layout(
152
+ xaxis=dict(
153
+ title='propensity_score'
154
+ ),
155
+ yaxis=dict(
156
+ title='count'
157
+ ),
158
+ barmode='overlay',
159
+ title='Propensity Distribution'
160
+ )
161
+
162
+ # Show
163
+ st.plotly_chart(fig,use_container_width=True)
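+ # Reading the chart: strong overlap between the two histograms indicates
+ # common support; little overlap means many treated rows lack a comparable control.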
164
+
165
+ def comparison(df, variable, flag='Y'):
166
+ # generates a comparison df for any given feature, split by the treatment flag
167
+ treatment_values = df[df[flag] == 1].groupby('quartiles')[variable].mean()
168
+ control_values = df[df[flag] == 0].groupby('quartiles')[variable].mean()
169
+ comparison = pd.merge(treatment_values, control_values, left_index=True, right_index=True)
170
+ comparison.rename({f'{variable}_x': f'treatment_{variable}', f'{variable}_y': f'control_{variable}'}, axis=1, inplace=True)
171
+ comparison['difference'] = np.abs(comparison[f'treatment_{variable}'] - comparison[f'control_{variable}'])
172
+ comparison['percent_difference'] = np.abs((comparison[f'treatment_{variable}'] - comparison[f'control_{variable}']) / comparison[f'treatment_{variable}'])
173
+ return comparison
174
+
175
+
176
+ # Function to calculate Cohen's d for independent samples
177
+
178
+ def cohend(d1, d2):
179
+ n1, n2 = len(d1), len(d2)
180
+ s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1)
181
+ s = sqrt(((n1-1) * s1 + (n2-1) * s2) / (n1 + n2 - 2))
182
+ u1, u2 = mean(d1), mean(d2)
183
+ # Check if the standard deviation is zero
184
+ if s == 0:
185
+ return 0 # Return 0 when the denominator is zero
186
+ else:
187
+ return (u1 - u2) / s
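+ # Worked example: d1 = [1, 2, 3] and d2 = [2, 3, 4] each have sample variance 1,
+ # so the pooled s is 1.0 and cohend(d1, d2) == -1.0, i.e. the group means
+ # differ by one pooled standard deviation.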
188
+
189
+ # Function to calculate standardized mean differences
190
+ def std_mean_diff(group_A_df, group_B_df):
191
+ cohend_values_arr = [0] * len(group_A_df.columns)
192
+
193
+ for i in range(len(group_A_df.columns)):
194
+ cohend_values_arr[i] = cohend(group_A_df[group_A_df.columns[i]], group_B_df[group_A_df.columns[i]])
195
+
196
+ cohend_array_pre_transp = [group_A_df.columns, cohend_values_arr]
197
+ np_array = np.array(cohend_array_pre_transp)
198
+ cohend_array = np.transpose(np_array)
199
+
200
+ return cohend_array
201
+
202
+ # Function to get matched IDs and calculate Cohen's d values
203
+ def cohend_code_function(binned_df, matching_df):
204
+ treat_df_complete = binned_df[binned_df['Y'] == 1].copy()  # .copy() avoids SettingWithCopyWarning on the drops below
205
+ control_df_complete = binned_df[binned_df['Y'] == 0].copy()
206
+ treat_df_complete.drop('Y', axis =1, inplace = True)
207
+ control_df_complete.drop('Y', axis =1, inplace = True)
208
+ treatment_cust = pd.DataFrame()
209
+ control_cust = pd.DataFrame()
210
+ treatment_cust['individual_id_ov'] = matching_df["Id"]
211
+ control_cust['individual_id_ov'] = matching_df["matched_Id"]
212
+
213
+ #getting cohend values for synthetic control population
214
+
215
+ group_A_df = treatment_cust[['individual_id_ov']]
216
+ group_A_df = group_A_df.merge(treat_df_complete,
217
+ how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
218
+ group_B_df = control_cust[['individual_id_ov']]
219
+ group_B_df = group_B_df.merge(control_df_complete,
220
+ how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
221
+
222
+ group_A_df.drop('individual_id_ov', axis =1, inplace = True)
223
+ group_B_df.drop('individual_id_ov', axis =1, inplace = True)
224
+
225
+ cohensd_df = std_mean_diff(group_A_df, group_B_df)
226
+ std_mean_diff_df = pd.DataFrame(columns=["Metrics","Cohend Value"])
227
+ for i in range(len(cohensd_df)):
228
+ std_mean_diff_df.loc[len(std_mean_diff_df.index)] = [cohensd_df[i][0],round(float(cohensd_df[i][1]),2)]
229
+
230
+ std_mean_diff_df["flag"] = std_mean_diff_df.apply(lambda x : 1 if (x["Cohend Value"]>0.1 or x["Cohend Value"]<-0.1) else 0, axis =1)
231
+ st.write('Number of variables whose standardized mean difference between treatment and control falls outside the desired range (-0.1, 0.1): ', std_mean_diff_df["flag"].sum())
232
+
233
+
234
+ # Download cohend output table
235
+ st.write(std_mean_diff_df)
236
+
237
+ #getting cohend values for General population
238
+
239
+ group_A_df = treatment_cust[['individual_id_ov']]
240
+ group_A_df = group_A_df.merge(treat_df_complete,
241
+ how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
242
+ group_B_df = control_df_complete[['individual_id_ov']]
243
+ group_B_df = group_B_df.merge(control_df_complete,
244
+ how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
245
+
246
+ group_A_df.drop('individual_id_ov', axis =1, inplace = True)
247
+ group_B_df.drop('individual_id_ov', axis =1, inplace = True)
248
+
249
+ cohensd_df = std_mean_diff(group_A_df, group_B_df)
250
+
251
+ std_mean_diff_df2 = pd.DataFrame(columns=["Metrics","Cohend Value"])
252
+
253
+ for i in range(len(cohensd_df)):
254
+ std_mean_diff_df2.loc[len(std_mean_diff_df2.index)] = [cohensd_df[i][0],round(float(cohensd_df[i][1]),2)]
255
+
256
+ return std_mean_diff_df2, std_mean_diff_df
257
+
258
+ def calculate_iv(df, flag, identifier):
259
+ df1 = df.drop([flag, identifier, 'propensity_score'], axis=1)
260
+ iv_df = pd.DataFrame(columns=['Feature', 'IV'])
261
+ for column in df1.columns:
262
+ data = pd.concat([pd.qcut(df1[column], q=10, duplicates='drop'), df[flag]], axis=1)
263
+ groups = data.groupby(by=column)[df[flag].name].agg(['count', 'sum'])
264
+ groups['event_rate'] = groups['sum'] / groups['count']
265
+ groups['non_event_rate'] = (groups['count'] - groups['sum']) / groups['count']
266
+ groups['WOE'] = np.log(groups['event_rate'] / groups['non_event_rate'])
267
+ groups['IV'] = (groups['event_rate'] - groups['non_event_rate']) * groups['WOE']
268
+ iv = groups['IV'].sum()
269
+ iv_df = pd.concat([iv_df, pd.DataFrame({'Feature': [column], 'IV': [iv]})],axis=0, ignore_index=True)
270
+ return iv_df
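+ # Note: WOE here uses within-bin event vs non-event rates; the classical
+ # credit-scoring WOE uses each bin's share of all events vs all non-events.
+ # Common IV rules of thumb: <0.02 unpredictive, 0.02-0.1 weak, 0.1-0.3
+ # medium, >0.3 strong.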
271
+
272
+ def xgboost_feature_importance(df, flag,identifier):
273
+ X, y = df.drop([flag,identifier,'propensity_score'],axis=1), df[[flag]]
274
+ model = xgb.XGBClassifier()
275
+ model.fit(X, y)
276
+ importances = model.feature_importances_
277
+ importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
278
+ importance_df = importance_df.sort_values(by='Importance', ascending=False)
279
+ return importance_df
280
+
281
+ # iv_result = calculate_iv(df_features, df_target)
282
+ # importance_result = xgboost_feature_importance(df_features, df_target)
283
+
284
+
285
+ def get_matching_pairs(identifier,treated_df, non_treated_df, sample_size_A, sample_size_B,matching_columns,flag):
286
+ # if treated_df[identifier].isna().any() or non_treated_df[identifier].isna().any():
287
+ # st.error("The identifier should not contain Nan's")
288
+
289
+ treated_df = treated_df[matching_columns].sample(frac=sample_size_A/100)
290
+ non_treated_df = non_treated_df[matching_columns].sample(frac=sample_size_B/100)
291
+
292
+ treated_df = treated_df.set_index(st.session_state.identifier)
293
+ treated_df.drop(flag,axis=1,inplace=True)
294
+
295
+ non_treated_df = non_treated_df.set_index(st.session_state.identifier)
296
+ non_treated_df.drop(flag,axis=1,inplace=True)
297
+
298
+ treated_x = treated_df.values
299
+ non_treated_x = non_treated_df.values
300
+
301
+ scaler = StandardScaler()
302
+ scaler.fit(treated_x)
303
+ treated_x = scaler.transform(treated_x)
304
+ non_treated_x = scaler.transform(non_treated_x)
305
+
306
+
307
+ print("data transformation completed")
308
+
309
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(non_treated_x)
310
+
311
+ print("model fitting completed")
312
+
313
+ distances, indices = nbrs.kneighbors(treated_x)
314
+
315
+ print("matching completed")
316
+
317
+ indices = indices.reshape([1,indices.shape[0]*indices.shape[1]])
318
+
319
+ # n_neighbors=1 yields exactly one match per treated row
+ res = list(treated_df.index)
323
+
324
+
325
+ output_df = pd.DataFrame()
326
+ output_df["Id"] = res
327
+ output_df["matched_Id"] = non_treated_df.iloc[indices[0]].index
328
+
329
+ return output_df
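+ # Usage sketch (column names illustrative): this is 1-NN matching with
+ # replacement, so one control row may be matched to several treated rows.
+ #   pairs = get_matching_pairs('individual_id_ov', treated, controls,
+ #                              sample_size_A=100, sample_size_B=100,
+ #                              matching_columns=['individual_id_ov', 'Y',
+ #                                                'propensity_score'],
+ #                              flag='Y')
+ #   pairs.columns  ->  ['Id', 'matched_Id']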
330
+
331
+ # Streamlit App
332
+ st.title("Matching")
333
+
334
+ # Calculate IV
335
+ iv_df = calculate_iv(st.session_state.binned_df, st.session_state.flag, st.session_state.identifier)
336
+
337
+ # Calculate XGBoost feature importance
338
+ importance_df = xgboost_feature_importance(st.session_state.binned_df, st.session_state.flag, st.session_state.identifier)
339
+
340
+ # Combine IV and feature importance into a final DataFrame
341
+ combined_df = pd.merge(iv_df, importance_df, on='Feature', suffixes=('_iv', '_importance'))
342
+ combined_df['Avg_IV_Importance'] = (combined_df['IV'] + combined_df['Importance']) / 2
343
+ combined_df.sort_values('Avg_IV_Importance',inplace=True,ascending=False)
344
+ # Add the 'Select' column with checkboxes
345
+ combined_df.insert(0, 'Select', False)
346
+ combined_df.reset_index(drop=True,inplace=True)
347
+
348
+ # Display the feature importances
349
+ st.subheader("Feature importances")
350
+ st.session_state["edited_df_combined"] = st.data_editor(
351
+ combined_df.style.hide(axis="index"),
352
+ column_config={
353
+ "Select": st.column_config.CheckboxColumn(required=True)
354
+ },
355
+ disabled=combined_df.drop("Select", axis=1).columns,use_container_width=True
356
+ )
357
+
358
+ # Allow users to enter the number of top features they want to select
359
+ top_features_input = st.number_input("Enter the number of top features", min_value=1, max_value=len(combined_df), value=None)
360
+
361
+ if top_features_input is not None:
362
+ # Select the top features based on user input
363
+ selected_df = combined_df.head(top_features_input)
364
+ selected_features = selected_df['Feature'].tolist()
365
+ else:
366
+ # Check if any features are selected via checkboxes
367
+ selected_features = st.session_state.edited_df_combined[st.session_state.edited_df_combined['Select']]['Feature'].tolist()
368
+
369
+ # Determine the selected features based on user input
370
+ #selected_features = checkbox_selected_features if checkbox_selected_features else selected_features
371
+
372
+ selected_features.append(st.session_state.identifier)
373
+ selected_features.append(st.session_state.flag)
374
+ # Update the session state with the selected features
375
+ st.session_state.selected_features = selected_features
376
+
377
+ with st.expander("Matching Inputs",expanded=True):
378
+ st.write("Matching Inputs")
379
+ ui_columns = st.columns((1, 1))
380
+ with ui_columns[0]:
381
+ sample_size_A = st.slider("Sample Size for treatment Group", 1, 100, 100)
382
+ with ui_columns[1]:
383
+ sample_size_B = st.slider("Sample Size for Control Group", 1, 100, 100)
384
+ with ui_columns[0]:
385
+ st.write("#")
386
+ run_matching = st.button(
387
+ label="Run Matching"
388
+ )
389
+ st.divider()
390
+ if run_matching:
391
+ matching_df = get_matching_pairs(st.session_state.identifier,st.session_state.treated_df, st.session_state.non_treated_df, sample_size_A, sample_size_B,st.session_state.selected_features,st.session_state.flag)
392
+ st.session_state.matching_df = matching_df
393
+ # Display the result
394
+ st.dataframe(st.session_state.matching_df)
395
+ if st.session_state.matching_df is not None:
396
+ #with st.expander("Download Matching DF"):
397
+ download_button = st.download_button(
398
+ label="Download Matched Data as CSV",
399
+ data=st.session_state.matching_df.to_csv(index=False).encode(),
400
+ file_name='matching_data.csv',
401
+ mime='text/csv',
402
+ )
403
+
404
+ if 'matching_df' not in st.session_state:
+ st.info("Run matching above to generate matched pairs before the diagnostics below.")
+ st.stop()
406
+
407
+ st.subheader("Matching diagnostics")
408
+ control_group = st.session_state.binned_df[st.session_state.binned_df[st.session_state.identifier].isin(st.session_state.matching_df['matched_Id'])]
409
+ treatment_group = st.session_state.binned_df[st.session_state.binned_df[st.session_state.flag] == 1]
410
+
411
+ #create combined group and add ventiles
412
+ combined_group = pd.concat([control_group, treatment_group])
413
+ combined_group['quartiles'] = pd.qcut(combined_group['propensity_score'], 4, labels=False)
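+ # pd.qcut buckets rows into four equal-sized propensity quartiles (0-3), which the comparison plots below group by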
414
+
415
+ combined_group.drop(st.session_state.identifier,axis=1,inplace=True)
416
+ st.session_state.combined_group=combined_group
417
+
418
+ if 'perform_diagnostics' not in st.session_state:
419
+ st.session_state.perform_diagnostics = False
420
+
421
+ # Display button
422
+ perform_diagnostics = st.button(label="Run Diagnostics")
423
+
424
+ if perform_diagnostics or st.session_state.perform_diagnostics:
425
+ st.session_state.perform_diagnostics = True
426
+ with st.expander("Matching Diagnostics", expanded=True):
427
+ left, right = st.columns(2)
428
+ std_mean_diff_df2,std_mean_diff_df = cohend_code_function(st.session_state.binned_df, st.session_state.matching_df)
429
+ st.subheader("Cohen's d Plot")
430
+ cohend_plot_function(std_mean_diff_df2,std_mean_diff_df, selected_features)
431
+
432
+ # Pre-matching Propensity Distribution
433
+ st.subheader("Pre-matching Propensity Distributions")
434
+ plot_propensity_distribution(st.session_state.binned_df[st.session_state.binned_df[st.session_state.flag] == 1]['propensity_score'], st.session_state.binned_df[st.session_state.binned_df[st.session_state.flag] == 0]['propensity_score'])
435
+
436
+ # Post-matching Propensity Distribution
437
+ st.subheader("Post-matching Propensity Distributions")
438
+ temp = pd.merge(left=st.session_state.matching_df, right=st.session_state.binned_df[[st.session_state.identifier, 'propensity_score']], left_on='Id', right_on=st.session_state.identifier, how='left')
439
+ temp.drop(st.session_state.identifier, axis=1, inplace=True)
440
+ temp.rename({'Id': 'treatment_id', 'matched_Id': 'control_id', 'propensity_score': 'treatment_propensity'}, axis=1, inplace=True)
441
+ temp = pd.merge(left=temp, right=st.session_state.binned_df[[st.session_state.identifier, 'propensity_score']], left_on='control_id', right_on=st.session_state.identifier, how='left')
442
+ temp.drop(st.session_state.identifier, axis=1, inplace=True)
443
+ temp.rename({'propensity_score': 'control_propensity'}, axis=1, inplace=True)
444
+
445
+ plot_propensity_distribution(temp['treatment_propensity'],temp['control_propensity'])
446
+
447
+
448
+
449
+ with st.expander("Comparison Plots",expanded=True):
450
+ st.markdown(
451
+ "<p class='plot-header'>Change the selected variable to plot"
452
+ " different charts</p>",
453
+ unsafe_allow_html=True,
454
+ )
455
+ left, right = st.columns(2)
456
+ with left:
457
+ if 'selected_variable_comp' not in st.session_state:
458
+ st.session_state.selected_variable_comp = [] # Initialize selected_variable
459
+
460
+ selected_variable_comp = st.multiselect(
461
+ "Variable",
462
+ st.session_state.combined_group.columns,
463
+ st.session_state.selected_variable_comp # Set the default value to the stored session state
464
+ )
465
+
466
+ # Update session state with selected variable
467
+ st.session_state.selected_variable_comp = selected_variable_comp
468
+
469
+ if st.session_state.selected_variable_comp:
470
+ # Plot comparisons for selected variables
471
+ comparisons = {}
472
+ for var in st.session_state.selected_variable_comp:
473
+ comparisons[var] = comparison(combined_group, var, flag=st.session_state.flag)
474
+ plot_comparison(comparisons[var])
475
+
476
+
477
+ # selected_variables = st.multiselect("Select variables for comparison", combined_group.columns)
478
+ # if selected_variables:
479
+ # # Plot comparisons for selected variables
480
+ # comparisons = {}
481
+ # for var in selected_variables:
482
+ # comparisons[var] = comparison(combined_group, var)
483
+ # plot_comparison(comparisons[var])
484
+
485
+
486
+
487
+
488
+
489
+
490
+
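Once matching has run, the eventual treatment-effect read-out is a paired difference on an outcome column. A minimal sketch (the outcome column is hypothetical; Id/matched_Id are the columns produced by get_matching_pairs above):

import pandas as pd

def matched_effect(df: pd.DataFrame, pairs: pd.DataFrame, outcome: str,
                   id_col: str = 'individual_id_ov') -> float:
    # Mean treated-minus-matched-control gap on `outcome`.
    out = df.set_index(id_col)[outcome]
    return float((out.loc[pairs['Id']].to_numpy()
                  - out.loc[pairs['matched_Id']].to_numpy()).mean())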
requirements.txt ADDED
@@ -0,0 +1,30 @@
1
+ dash==2.9.3
2
+ dash_auth==2.0.0
3
+ dash_bootstrap_components==1.4.1
4
+ holidays==0.24
5
+ hyperopt==0.2.7
6
+ joblib==1.2.0
7
+ matplotlib==3.5.1
8
+ mdutils==1.5.0
9
+ numpy==1.22.4
10
+ openpyxl==3.0.10
11
+ openpyxl_image_loader==1.0.5
12
+ pandas==1.5.2
13
+ # Pillow==9.4.0
14
+ Pillow==10.2.0
15
+ plotly==5.14.1
16
+ pmdarima==2.0.2
17
+ prophet==1.1.2
18
+ python-dotenv==1.0.0
19
+ # pytz==2022.7.1
20
+ pytz==2022.7
21
+ scikit_learn==1.2.2
22
+ scipy==1.7.3
23
+ seaborn==0.11.2
24
+ shap==0.41.0
25
+ statsmodels==0.13.5
26
+ streamlit==1.27.2
27
+ streamlit-aggrid==0.3.4.post3
28
+ sweetviz==2.3.1
29
+ waitress==2.1.2
30
+ xgboost==1.6.2
styles.css ADDED
@@ -0,0 +1,58 @@
1
+ html {
2
+ margin: 0;
3
+ }
4
+
5
+ #MainMenu {
6
+
7
+ visibility: collapse;
8
+ }
9
+
10
+ footer {
11
+ visibility: collapse;
12
+ }
13
+
14
+ div.block-container{
15
+ padding: 2rem 3rem;
16
+ }
17
+
18
+
19
+ .main-header {
20
+ display: flex;
21
+ flex-direction: row;
22
+ justify-content: space-between;
23
+ align-items: center;
24
+ }
25
+ .main-header > img {
26
+ max-height: 96px;
27
+ /* max-width: 300px; */
28
+ object-fit: cover;
29
+ }
30
+
31
+
32
+
33
+ button div {
34
+ overflow: hidden;
35
+ text-overflow:ellipsis;
36
+ white-space: nowrap;
37
+ }
38
+
39
+
40
+
41
+ h1 {
42
+ color: #053057;
43
+ }
44
+
45
+ hr {
46
+ height: 10px !important;
47
+ color: #053057;
48
+ }
49
+
50
+ p.plot-header {
51
+ font-size: small;
52
+ font-weight: bold;
53
+ }
54
+
55
+ hr {
56
+ margin: 0 0 10px 0;
57
+ padding: 0;
58
+ }