Manoj committed
Commit a9415a6 • 1 Parent(s): 87ec425

first

Browse files
- Home.py +96 -0
- README.md +1 -1
- logo.png +0 -0
- pages/pages/1_Imputations.py +415 -0
- pages/pages/2_Profiling.py +775 -0
- pages/pages/3_Point estimates.py +369 -0
- pages/pages/4_Matching & Diagnostics.py +490 -0
- requirements.txt +30 -0
- styles.css +58 -0
Home.py
ADDED
@@ -0,0 +1,96 @@
import streamlit as st
import pandas as pd
import os
import base64
from pathlib import Path

path = os.path.dirname(__file__)
file_ = open(f"{path}/logo.png", "rb")
contents = file_.read()
data_url = base64.b64encode(contents).decode("utf-8")
file_.close()

def load_local_css(file_name):
    with open(file_name) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

def set_header():
    return st.markdown(
        f"""<div class='main-header'>
        <h1>Synthetic Control</h1>
        <img src="data:image;base64,{data_url}", alt="Logo">
        </div>""",
        unsafe_allow_html=True,
    )


st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()

st.title("Input data")

data_file = st.file_uploader(
    label="Choose a file",
    accept_multiple_files=False,
    key="user_upload_file",
    type=["csv", "xlsx"]
)

info_placeholder = st.empty()

if data_file:
    # df = pd.read_csv(data_file, dtype={'individual_id_ov': str})
    dtype = {'individual_id_ov': 'str',
             'past_3month_GMV_GMA': 'float64',
             'past_3month_qty_GMA': 'int64',
             'past_3month_orders_GMA': 'int64',
             'past_6month_GMV_GMA': 'float64',
             'past_6month_qty_GMA': 'int64',
             'past_6month_orders_GMA': 'int64',
             'past_9month_GMV_GMA': 'float64',
             'past_9month_qty_GMA': 'int64',
             'past_9month_orders_GMA': 'int64',
             'past_12month_GMV_GMA': 'float64',
             'past_12month_qty_GMA': 'int64',
             'past_12month_orders_GMA': 'int64',
             'avg_order_gap_between_GMA_purchases': 'float64',
             'days_since_last_GMA_purchase': 'float64',
             'age': 'float64',
             'gender': 'str',
             'income_group': 'str',
             'age_group': 'str',
             'urbanicity': 'str',
             'ethnicity': 'str',
             'Kids': 'str',
             'hh_size_excl_child': 'str',
             'hh_adult_qty': 'float64',
             'hh_scs_est_per1000_income_amt': 'float64',
             'avg_order_gap_between_WMT_purchases': 'float64',
             'days_since_last_WMT_purchase': 'float64',
             'Y': 'int64'}
    df = pd.read_excel(data_file, sheet_name='sheet1', dtype=dtype, engine='openpyxl')
    st.session_state.df = df
    st.write(df.head())
    with info_placeholder:
        st.success("File upload successful")

    plot_df = pd.read_excel(data_file, sheet_name='sheet2')
    st.session_state.plot_df = plot_df
    # start_date = st.date_input("Start date")
    # end_date = st.date_input("End date")

    # # Show the selected date range
    # st.write("Selected date range:", start_date, "to", end_date)

    # uploaded_file = st.file_uploader("Choose a file")

    # if uploaded_file is not None:
    #     df = pd.read_csv(uploaded_file, dtype={'individual_id_ov': str})
    #     st.session_state.df = df
    #     st.success("File upload successful, here is the data preview")
    #     st.write(df.head())
README.md
CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
 colorTo: yellow
 sdk: streamlit
 sdk_version: 1.40.0
-app_file:
+app_file: Home.py
 pinned: false
 ---
logo.png
ADDED
pages/pages/1_Imputations.py
ADDED
@@ -0,0 +1,415 @@
##### SAFE IMPUTATION #####

import pandas as pd
import numpy as np
from scipy import stats
import warnings
import streamlit as st
import base64

def outlier_per_col(df, col):
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    # D'Agostino-Pearson normality test to find the distribution of the data
    dist_name, p = stats.normaltest(df[col])[0], stats.normaltest(df[col])[1]

    # if p > 0.05 then the data is normally distributed
    # if p <= 0.05 then the data is not normally distributed
    if p <= 0.05:
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outlier_df = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outlier_per = (len(outlier_df) / len(df[col])) * 100
    else:
        z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
        outlier_df = df[(z_score > 3)]
        outlier_per = len(outlier_df) / len(df[col]) * 100
    return outlier_per

def summary_stats(df, per_to_drop):
    summary_df = df.isna().sum().reset_index().rename(columns={'index': 'variable', 0: 'null'})
    summary_df['%null'] = (100 * summary_df['null'] / len(df)).round(2)
    summary_df = summary_df.merge(df.dtypes.reset_index().rename(columns={'index': 'variable', 0: 'type'}), on='variable')
    summary_df = summary_df.drop(columns=['null'])
    summary_df = summary_df.drop(summary_df[summary_df['%null'] > per_to_drop].index)
    df_numeric = df.select_dtypes(exclude='object')
    df_categorical = df.select_dtypes(include='object')
    if not df_numeric.empty:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            summary_df['outlier%'] = summary_df[summary_df['variable'].isin(df_numeric.columns)].apply(lambda x: outlier_per_col(df_numeric, x['variable']), axis=1)
    else:
        summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'outlier%': []})])
    summary_df = summary_df.merge((df.select_dtypes(exclude=['object']).nunique() / df.select_dtypes(exclude=['object']).count() * 100).reset_index().rename(columns={'index': 'variable', 0: 'unique%'}).round(2), on='variable', how='left').round(2)
    summary_df = summary_df.merge(df.mean(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'mean'}).round(2), on='variable', how='left')
    summary_df = summary_df.merge(df.std(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'standard deviation'}).round(2), on='variable', how='left')
    summary_df = (summary_df.merge(df.var(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'variance'}), on='variable', how='left').assign(variance=lambda x: x['variance'].apply(lambda y: "{:.2f}".format(y))))
    summary_df = summary_df.merge(df.skew(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'skewness'}).round(2), on='variable', how='left')
    summary_df = summary_df.merge(df.kurt(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'kurtosis'}).round(2), on='variable', how='left')
    summary_df = summary_df.merge(df.min(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'min'}), on='variable', how='left')
    summary_df = summary_df.merge(df.max(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'max'}), on='variable', how='left')
    summary_df['range'] = summary_df['max'] - summary_df['min']
    if not df_numeric.empty:
        summary_df = summary_df.merge((df.describe().loc['75%'].T - df.describe().loc['25%'].T).reset_index().rename(columns={'index': 'variable', 0: 'iqr'}), on='variable', how='left')
    else:
        summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'iqr': []})])
    summary_df = summary_df.merge(df.median(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'median'}), on='variable', how='left')
    if not df_categorical.empty:
        summary_df = summary_df.merge(df.select_dtypes(include=['object']).mode().iloc[0].reset_index().rename(columns={'index': 'variable', 0: 'mode'}), on='variable', how='left')
        summary_df = summary_df.merge(df.select_dtypes(include=['object']).nunique().reset_index().rename(columns={'index': 'variable', 0: 'distinct count'}), on='variable', how='left')
    else:
        summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'mode': []})])
        summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'distinct count': []})])
    return summary_df


def mean_imputation(df, col):
    df[col].fillna(round(df[col].mean(), 2), inplace=True)

def median_imputation(df, col):
    median = df[col].median()
    df[col].fillna(round(median, 2), inplace=True)

def drop_rows(df, col):
    df.dropna(subset=[col], inplace=True)

def drop_column(df, col):
    df.drop(col, axis=1, inplace=True)

def mode_imputation(df, col):
    mode = df[col].mode()[0]
    df[col].fillna(mode, inplace=True)

def arbitrary_val(df, col, val):
    df[col].fillna(val, inplace=True)

def linear_interpolate(df, col):
    df[col].interpolate(method='linear', inplace=True)

def polynomial_interpolate(df, col):
    df[col].interpolate(method='polynomial', order=2, inplace=True)

def interpolate_padding_forward(df, col):
    df[col].fillna(method='ffill', inplace=True)

def interpolate_padding_backward(df, col):
    df[col].fillna(method='bfill', inplace=True)

def fill_0(df, col):
    df[col].fillna(0, inplace=True)

def remove_outliers(df, col):
    dist_name, p = stats.normaltest(df[col])[0], stats.normaltest(df[col])[1]
    if p <= 0.05:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    else:
        z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
        df = df[(z_score < 3)]
    return df

def mean_outlier(df, col):
    dist_name, p = stats.normaltest(df[col])[0], stats.normaltest(df[col])[1]
    if p <= 0.05:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df[col][df[col] < lower_bound] = df[col].mean()
        df[col][df[col] > upper_bound] = df[col].mean()
    else:
        z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
        df.loc[z_score > 3, col] = df[col].mean()
    return df

def median_outlier(df, col):
    dist_name, p = stats.normaltest(df[col])[0], stats.normaltest(df[col])[1]
    if p <= 0.05:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df[col][df[col] < lower_bound] = df[col].median()
        df[col][df[col] > upper_bound] = df[col].median()
    else:
        z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
        df.loc[z_score > 3, col] = df[col].median()
    return df

def outlier_capping(df, col):
    dist_name, p = stats.normaltest(df[col])[0], stats.normaltest(df[col])[1]
    if p <= 0.05:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df[col] = np.where(df[col] >= upper_bound, upper_bound, np.where(df[col] <= lower_bound, lower_bound, df[col]))
    else:
        upper_limit = df[col].mean() + (3 * df[col].std())
        lower_limit = df[col].mean() - (3 * df[col].std())
        df[col] = np.where(df[col] >= upper_limit, upper_limit, np.where(df[col] <= lower_limit, lower_limit, df[col]))
    return df

def perform_treatment_missing(df, col, treatments):
    if treatments == 'mean':
        mean_imputation(df, col)
    elif treatments == 'median':
        median_imputation(df, col)
    elif treatments == 'drop row':
        drop_rows(df, col)
    elif treatments == 'drop column':
        drop_column(df, col)
    elif treatments == 'linear interpolation':
        linear_interpolate(df, col)
    elif treatments == 'polynomial interpolation':
        polynomial_interpolate(df, col)
    elif treatments == 'ffill':
        interpolate_padding_forward(df, col)
    elif treatments == 'bfill':
        interpolate_padding_backward(df, col)
    elif treatments == 'mode':
        mode_imputation(df, col)
    elif treatments == 'fill_0':
        fill_0(df, col)
    else:
        return df[col]

def perform_treatment_outlier(df, col, treatments):
    if treatments == 'remove':
        remove_outliers(df, col)
    elif treatments == 'mean':
        mean_outlier(df, col)
    elif treatments == 'median':
        median_outlier(df, col)
    elif treatments == 'capping':
        outlier_capping(df, col)
    else:
        return df[col]

def imputed_df(df, edited_df, identifier, flag, per_to_drop=None):
    if per_to_drop is not None:
        null_percentage = df.isnull().sum() / df.shape[0] * 100
        col_to_drop = null_percentage[null_percentage > per_to_drop].keys()
        df = df.drop(col_to_drop, axis=1)

    cols_with_one_unique = df.columns[df.nunique() == 1]
    df.drop(cols_with_one_unique, axis=1, inplace=True)

    for col in edited_df['variable'].to_list():
        perform_treatment_missing(df, col, edited_df.loc[edited_df['variable'] == col, 'Imputation method'].iloc[0])
        perform_treatment_outlier(df, col, edited_df.loc[edited_df['variable'] == col, 'Outlier Treatment'].iloc[0])
    return df

# flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
# identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))

# numerical_columns = st.session_state.df.select_dtypes(include=['number']).columns.tolist()
# numerical_columns = [x for x in numerical_columns if x != flag]
# categorical_columns = st.session_state.df.select_dtypes(include=['object', 'category']).columns.tolist()
# categorical_columns = [x for x in categorical_columns if x != identifier]

# st.session_state.flag = flag
# st.session_state.identifier = identifier
st.title("Data Summary")

with st.expander("Data Inputs"):
    st.subheader("Data Inputs")
    ui_columns = st.columns((1, 1))
    columns = set(st.session_state.df.columns)
    with ui_columns[0]:
        flag = st.selectbox(
            label="Flag variable",
            options=list(columns),
            index=list(columns).index(st.session_state.flag) if 'flag' in st.session_state and st.session_state.flag is not None else 0
        )
        per_to_drop = st.slider(
            label="Select missing % threshold to drop columns",
            key="per_to_drop",
            min_value=0, max_value=100, value=st.session_state.per_to_drop if 'per_to_drop' in st.session_state else 80)

    with ui_columns[-1]:
        identifier = st.selectbox(
            label="Identifier",
            options=list(columns),
            index=list(columns).index(st.session_state.identifier) if 'identifier' in st.session_state and st.session_state.identifier is not None else 0
        )

    # numerical_columns = st.session_state.df.select_dtypes(include=['number']).columns.tolist()
    # numerical_columns = [x for x in numerical_columns if x != flag]
    # categorical_columns = st.session_state.df.select_dtypes(include=['object', 'category']).columns.tolist()
    # categorical_columns = [x for x in categorical_columns if x != identifier]
    # st.session_state.numerical_columns = numerical_columns
    # st.session_state.categorical_columns = categorical_columns
    st.session_state.flag = flag
    st.session_state.identifier = identifier

# st.subheader("Select Ordinal Columns:")
# with st.expander("Select Ordinal Columns:", expanded=True):
#     select_all_checkbox = st.checkbox("Select All", key="select_all_checkbox")

#     options = categorical_columns

#     # Checkboxes for each column
#     ordinal_columns = []
#     for option in options:
#         if select_all_checkbox or st.checkbox(option, key=f"checkbox_{option}"):
#             ordinal_columns.append(option)
#     st.session_state.ordinal_columns = list(ordinal_columns)

# nominal_columns = [x for x in categorical_columns if x not in ordinal_columns]
# st.session_state.numerical_columns = numerical_columns
# st.session_state.categorical_columns = categorical_columns
# st.session_state.ordinal_columns = ordinal_columns

# Ordinal columns order
# ordinal_col_dict = st.session_state.get("ordinal_col_dict", {})

# ordinal_col_dict = {}

# for col in ordinal_columns:
#     st.subheader(f"Ordering for Unique Values in {col}")

#     # Get unique values excluding NaN
#     unique_values = st.session_state.df[col].dropna().unique()

#     order_dict = {}

#     for val in unique_values:
#         order = st.number_input(f"Order for {val} in {col}", min_value=1, value=1)
#         order_dict[val] = order

#     ordinal_col_dict[col] = order_dict

# st.session_state.ordinal_col_dict = ordinal_col_dict

# User input for percentage threshold to drop columns
# per_to_drop = st.slider("Select Percentage Threshold to Drop Columns", min_value=0, max_value=100, value=10)
# st.session_state.per_to_drop = per_to_drop

summary_df = summary_stats(st.session_state.df, per_to_drop)
summary_df["Imputation method"] = None
summary_df["Outlier Treatment"] = None
summary_df["Imputation method"] = np.where(summary_df["type"] == 'object', 'mode', 'mean')
summary_df["Outlier Treatment"] = np.where(summary_df["type"] == 'object', summary_df["Outlier Treatment"], 'capping')
summary_df = summary_df[~summary_df['variable'].isin([flag, identifier])]
st.session_state.summary_df = summary_df

st.subheader("Variable Summary")

IMPUTATION_OPTIONS = ["mean", "median", "linear interpolation", "polynomial interpolation", "ffill", "bfill", "mode", "fill_0"]
OUTLIER_OPTIONS = ["capping", "remove", "mean", "median"]
NON_EDITABLE_COLUMNS = summary_df.columns.to_list()

def highlight_cols(s):
    color = "#ccc"
    return "background-color: %s" % color

column_config = {
    "variable": st.column_config.TextColumn(disabled=True, width="medium"),
    "type": st.column_config.TextColumn(disabled=True, width="medium"),
    "%null": st.column_config.NumberColumn(disabled=True),
    "unique%": st.column_config.NumberColumn(disabled=True),
    "outlier%": st.column_config.NumberColumn(disabled=True),
    "mean": st.column_config.NumberColumn(disabled=True),
    "standard deviation": st.column_config.NumberColumn(disabled=True),
    "variance": st.column_config.NumberColumn(disabled=True),
    "skewness": st.column_config.NumberColumn(disabled=True),
    "kurtosis": st.column_config.NumberColumn(disabled=True),
    "min": st.column_config.NumberColumn(disabled=True),
    "max": st.column_config.NumberColumn(disabled=True),
    "range": st.column_config.NumberColumn(disabled=True),
    "iqr": st.column_config.NumberColumn(disabled=True),
    "median": st.column_config.NumberColumn(disabled=True),
    "IV": st.column_config.NumberColumn(disabled=True),
    "mode": st.column_config.TextColumn(disabled=True),
    "distinct count": st.column_config.NumberColumn(disabled=True),
    "Imputation method": st.column_config.SelectboxColumn(
        options=IMPUTATION_OPTIONS, default=0
    ),
    "Outlier Treatment": st.column_config.SelectboxColumn(
        options=OUTLIER_OPTIONS, default=0
    )
}


with st.expander("Variables from the data"):
    edited_df = st.data_editor(
        st.session_state.summary_df
        .style.hide(axis="index")
        .applymap(highlight_cols, subset=NON_EDITABLE_COLUMNS),
        column_config=column_config,
    )
    if st.button("Submit changes"):
        with st.spinner("Applying imputations"):
            st.divider()
            edited_df = st.session_state.summary_df.copy()  # Make a copy of the original DataFrame
            edited_df["Imputation method"] = st.session_state.summary_df["Imputation method"]  # Update the imputation method column
            edited_df["Outlier Treatment"] = st.session_state.summary_df["Outlier Treatment"]  # Update the outlier treatment method column

            imputed_df = imputed_df(st.session_state.df, edited_df, st.session_state.identifier, st.session_state.flag, st.session_state.per_to_drop)
            st.session_state.imputed_df = imputed_df
            st.markdown("Imputed DataFrame")
            st.dataframe(imputed_df.head(10))

# Add a download button for the imputed DataFrame
# if st.session_state.imputed_df is not None:
#     csv_data = st.session_state.imputed_df.to_csv(index=False).encode()
#     st.download_button(
#         label="Download Imputed DataFrame as CSV",
#         data=csv_data,
#         file_name="imputed_data.csv",
#         mime="text/csv"
#     )

# Add the download button after displaying the DataFrame
# if st.dataframe:
#     if st.button("Download Imputed Data"):
#         imputed_csv = imputed_df.to_csv(index=False)
#         b64 = base64.b64encode(imputed_csv.encode()).decode()
#         href = f'<a href="data:file/csv;base64,{b64}" download="imputed_data.csv">Download Imputed Data CSV File</a>'
#         st.markdown(href, unsafe_allow_html=True)

if "imputed_df" in st.session_state:
    if st.button("Download Imputed Data"):
        imputed_df = st.session_state.imputed_df
        imputed_csv = imputed_df.to_csv(index=False)
        b64 = base64.b64encode(imputed_csv.encode()).decode()
        href = f'<a href="data:file/csv;base64,{b64}" download="imputed_data.csv">Download Imputed Data CSV File</a>'
        st.markdown(href, unsafe_allow_html=True)


# Check if the "Submit changes" button has been clicked

# if st.button("Submit"):
#     st.write("Selected Columns and Ordinal Orders:")
#     st.write(ordinal_col_dict)

#     # Display summary stats
#     summary_df = summary_stats(st.session_state.df, per_to_drop)
#     st.write("Summary Stats:")
#     st.write(summary_df)

#     # User input for specific column
#     col_name = st.selectbox("Select a specific column name:", [None] + list(st.session_state.df.columns))

#     # Display stats for the specified column
#     if col_name in st.session_state.df.columns:
#         st.write(f"Stats for column '{col_name}':")
#         # Extract relevant information from 'summary_df' for the specific column
#         col_summary = summary_df[summary_df['variable'] == col_name][['%null', 'type', 'outlier%', 'unique%', 'mean', 'standard deviation', 'variance', 'skewness', 'kurtosis', 'min', 'max', 'range', 'iqr', 'median', 'mode', 'distinct count']]
#         col_summary = col_summary.T.reset_index()
#         col_summary.columns = ['Stats', 'Value']
#         # Display the summary statistics as a table
#         st.table(col_summary)
#     else:
#         st.warning("Please enter a valid column name.")
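The treatment helpers above mutate the DataFrame in place and are dispatched column by column from the edited summary table. A rough standalone sketch of that flow, on hypothetical toy data and with the helper functions above in scope (at least eight rows are used because stats.normaltest is called inside the outlier treatments):

import numpy as np
import pandas as pd

# Hypothetical toy frame: 'age' has a missing value and an extreme value, 'gender' has a missing value.
toy = pd.DataFrame({
    "age": [25.0, 31.0, np.nan, 41.0, 29.0, 35.0, 38.0, 27.0, 300.0, 33.0],
    "gender": ["F", "M", "F", None, "F", "M", "M", "F", "M", "F"],
})
# A minimal stand-in for the edited summary table produced by st.data_editor.
plan = pd.DataFrame({
    "variable": ["age", "gender"],
    "Imputation method": ["median", "mode"],
    "Outlier Treatment": ["capping", None],
})
for col in plan["variable"]:
    perform_treatment_missing(toy, col, plan.loc[plan["variable"] == col, "Imputation method"].iloc[0])
    perform_treatment_outlier(toy, col, plan.loc[plan["variable"] == col, "Outlier Treatment"].iloc[0])
print(toy)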
pages/pages/2_Profiling.py
ADDED
@@ -0,0 +1,775 @@
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
from sklearn.tree import DecisionTreeClassifier
|
4 |
+
from sklearn.model_selection import GridSearchCV
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
from tqdm import tqdm
|
7 |
+
from matplotlib.ticker import MaxNLocator
|
8 |
+
import streamlit as st
|
9 |
+
import ast
|
10 |
+
from collections import defaultdict
|
11 |
+
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
|
12 |
+
from sklearn.cluster import KMeans, AgglomerativeClustering
|
13 |
+
from sklearn.preprocessing import LabelEncoder
|
14 |
+
#from kmodes.kmodes import KModes
|
15 |
+
import matplotlib.pyplot as plt
|
16 |
+
import seaborn as sns
|
17 |
+
#from kmodes.kprototypes import KPrototypes
|
18 |
+
import warnings
|
19 |
+
import pandas as pd
|
20 |
+
import numpy as np
|
21 |
+
from scipy import stats
|
22 |
+
import scipy.cluster.hierarchy as sch
|
23 |
+
from scipy.spatial.distance import pdist
|
24 |
+
import os
|
25 |
+
import re
|
26 |
+
import time
|
27 |
+
from plotly.subplots import make_subplots
|
28 |
+
import plotly.graph_objects as go
|
29 |
+
import numpy as np
|
30 |
+
import plotly.express as px
|
31 |
+
import base64
|
32 |
+
|
33 |
+
|
34 |
+
def tree_based_bin_data(df, column_name, dep_var, depth_of_tree):
|
35 |
+
df2 = df.copy()
|
36 |
+
df2 = df2.loc[df2[column_name].notnull()]
|
37 |
+
x = df2[column_name].values.reshape(-1, 1)
|
38 |
+
y = df2[dep_var].values
|
39 |
+
params = {'max_depth': range(2, depth_of_tree + 1), 'min_samples_split': [2, 3, 5, 10], 'min_samples_leaf': [int(np.ceil(0.05 * len(x)))]}
|
40 |
+
clf = DecisionTreeClassifier()
|
41 |
+
g_search = GridSearchCV(clf, param_grid=params, scoring='accuracy')
|
42 |
+
g_search.fit(x, y)
|
43 |
+
best_clf = g_search.best_estimator_
|
44 |
+
bin_edges = best_clf.tree_.threshold
|
45 |
+
bin_edges = sorted(set(bin_edges[bin_edges != -2]))
|
46 |
+
tree_based_binned_data = value_bin_data(df, column_name, bin_edges)
|
47 |
+
return tree_based_binned_data
|
48 |
+
|
49 |
+
|
50 |
+
def decile_bin_data(df, col, no_of_bins):
|
51 |
+
decile_binned_data = pd.qcut(df[col], no_of_bins, duplicates='drop')
|
52 |
+
return decile_binned_data
|
53 |
+
|
54 |
+
|
55 |
+
def value_bin_data(df, col, no_of_bins):
|
56 |
+
value_binned_data = pd.cut(df[col], no_of_bins, duplicates='drop')
|
57 |
+
return value_binned_data
|
58 |
+
|
59 |
+
|
60 |
+
def col_bin_summary_numerical(bin_df, col, dep_var=None):
|
61 |
+
unique_bin_edges = bin_df[col].unique()
|
62 |
+
df_new = pd.DataFrame({"bin_ranges": unique_bin_edges})
|
63 |
+
|
64 |
+
try:
|
65 |
+
df_new = df_new.merge((bin_df[col].value_counts() / len(bin_df) * 100).reset_index().rename(columns={'index': 'bin_ranges', col: 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
|
66 |
+
except:
|
67 |
+
df_new = df_new.merge((bin_df[col].value_counts() / len(bin_df) * 100).reset_index().rename(columns={col: 'bin_ranges', 'count': 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
|
68 |
+
if dep_var is not None:
|
69 |
+
df_new = df_new.merge(bin_df.groupby(col)[dep_var].sum().reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Event'}), on='bin_ranges', how='left')
|
70 |
+
df_new = df_new.merge(bin_df.groupby(col)[dep_var].mean().reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'}), on='bin_ranges', how='left')
|
71 |
+
df_new['Index'] = (100 * df_new['Mean_DV'] / bin_df['Y'].mean()).round()
|
72 |
+
df_new = df_new[['bin_ranges', 'count%', 'Event', 'Mean_DV', 'Index']]
|
73 |
+
df_new = df_new.sort_values(by='bin_ranges')
|
74 |
+
|
75 |
+
return df_new
|
76 |
+
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
def plot_chart(df, col, dep_var):
|
82 |
+
#fig = go.Figure()
|
83 |
+
df['bin_ranges_str'] = df['bin_ranges'].astype(str)
|
84 |
+
fig = make_subplots(specs=[[{"secondary_y": True}]])
|
85 |
+
# Bar trace for Count%
|
86 |
+
|
87 |
+
fig.add_trace(
|
88 |
+
go.Bar(
|
89 |
+
x=df['bin_ranges_str'],
|
90 |
+
y=df['count%'],
|
91 |
+
name='Count%',
|
92 |
+
marker_color='#053057',
|
93 |
+
hovertemplate=(
|
94 |
+
f"Bin: %{{x}}<br>"
|
95 |
+
f"Count%: %{{y}}"
|
96 |
+
),
|
97 |
+
)
|
98 |
+
)
|
99 |
+
|
100 |
+
# Add the line trace for Index on the secondary y-axis
|
101 |
+
fig.add_trace(
|
102 |
+
go.Scatter(
|
103 |
+
x=df['bin_ranges_str'],
|
104 |
+
y=df['Index'],
|
105 |
+
mode='lines+markers',
|
106 |
+
name='Index',
|
107 |
+
marker=dict(color="#8ac4f8"),
|
108 |
+
hovertemplate=(
|
109 |
+
f"Bin: %{{x}}<br>"
|
110 |
+
f"Index%: %{{y}}"
|
111 |
+
),
|
112 |
+
),
|
113 |
+
secondary_y=True
|
114 |
+
)
|
115 |
+
|
116 |
+
# Update layout
|
117 |
+
fig.update_layout(
|
118 |
+
title=f'Distribution of {col}',
|
119 |
+
xaxis=dict(title='Bin_ranges'),
|
120 |
+
yaxis=dict(title='Count%', color='#053057'),
|
121 |
+
yaxis2=dict(title='Index', color="#8ac4f8", overlaying='y', side='right'),
|
122 |
+
legend=dict(x=1.02, y=0.98),
|
123 |
+
hovermode='x'
|
124 |
+
)
|
125 |
+
|
126 |
+
fig.update_xaxes(showgrid=False)
|
127 |
+
fig.update_yaxes(showgrid=False)
|
128 |
+
|
129 |
+
return fig
|
130 |
+
|
131 |
+
# def plot_chart(df, col, dep_var=None):
|
132 |
+
# fig, ax1 = plt.subplots(figsize=(10, 6))
|
133 |
+
|
134 |
+
# # Convert Interval type to string
|
135 |
+
# df['bin_ranges_str'] = df['bin_ranges'].astype(str)
|
136 |
+
|
137 |
+
# ax1.bar(df['bin_ranges_str'], df['count%'], color='b', alpha=0.7, label='Count%')
|
138 |
+
# ax1.set_xlabel('Bin Ranges')
|
139 |
+
# ax1.set_ylabel('Count%', color='b')
|
140 |
+
|
141 |
+
# if dep_var is not None:
|
142 |
+
# ax2 = ax1.twinx()
|
143 |
+
# ax2.plot(df['bin_ranges_str'], df['Index'], color='r', marker='o', label='Index')
|
144 |
+
# ax2.set_ylabel('Index', color='r')
|
145 |
+
|
146 |
+
# ax1.set_title(f'Distribution of {col}')
|
147 |
+
# ax1.legend(loc='upper left')
|
148 |
+
|
149 |
+
# return st.plotly_chart(fig)
|
150 |
+
|
151 |
+
|
152 |
+
|
153 |
+
|
154 |
+
|
155 |
+
def create_numerical_binned_data(df, col, func,no_of_bins=None,dep_var=None, depth=None):
|
156 |
+
df_org = df.copy()
|
157 |
+
|
158 |
+
if dep_var is not None:
|
159 |
+
df_org[dep_var] = df_org[dep_var].astype('int64')
|
160 |
+
df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)
|
161 |
+
|
162 |
+
if func == 'tree':
|
163 |
+
bin_df = tree_based_bin_data(df, col, dep_var, depth)
|
164 |
+
elif func == 'decile':
|
165 |
+
bin_df = decile_bin_data(df_num, col, 10)
|
166 |
+
else:
|
167 |
+
bin_df = value_bin_data(df_num, col, no_of_bins)
|
168 |
+
|
169 |
+
bin_df = pd.concat([bin_df, df_org[dep_var]], axis=1)
|
170 |
+
else:
|
171 |
+
df_num = df_org.select_dtypes(include=[np.number])
|
172 |
+
|
173 |
+
if func == 'decile':
|
174 |
+
bin_df = decile_bin_data(df_num, col, no_of_bins)
|
175 |
+
else:
|
176 |
+
bin_df = value_bin_data(df_num, col, no_of_bins)
|
177 |
+
|
178 |
+
df_summary = col_bin_summary_numerical(bin_df,col, dep_var)
|
179 |
+
|
180 |
+
return df_summary
|
181 |
+
|
182 |
+
|
183 |
+
def create_numerical_binned_data1(df, col, func,no_of_bins,dep_var,depth=None):
|
184 |
+
df_org = df.copy()
|
185 |
+
|
186 |
+
df_org[dep_var] = df_org[dep_var].astype('int64')
|
187 |
+
df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)
|
188 |
+
|
189 |
+
if func == 'tree':
|
190 |
+
bin_df = tree_based_bin_data(df, col, dep_var, depth)
|
191 |
+
elif func == 'decile':
|
192 |
+
bin_df = decile_bin_data(df_num, col, no_of_bins)
|
193 |
+
else:
|
194 |
+
bin_df = value_bin_data(df_num, col, no_of_bins)
|
195 |
+
|
196 |
+
bin_df = pd.concat([bin_df, df_org[dep_var]], axis=1)
|
197 |
+
|
198 |
+
binned_data=pd.DataFrame()
|
199 |
+
binned_data[col]=df_org[col]
|
200 |
+
unique_bins = bin_df[col].unique()
|
201 |
+
for bin_value in unique_bins:
|
202 |
+
bin_column_name = f"{col}_{bin_value}"
|
203 |
+
binned_data[bin_column_name] = np.where(binned_data[col] == bin_value, df_org[col], 0)
|
204 |
+
|
205 |
+
return binned_data
|
206 |
+
|
207 |
+
|
208 |
+
#Categorical cols binning
|
209 |
+
|
210 |
+
def woe_iv(df, column_name, dep_var, no_of_bins):
|
211 |
+
y0 = df[dep_var].value_counts()[0]
|
212 |
+
y1 = df[dep_var].value_counts()[1]
|
213 |
+
if df[column_name].nunique() < 10:
|
214 |
+
data = pd.Series(pd.factorize(df[column_name])[0] + 1, index=df.index).rename('{}'.format(column_name)).apply(lambda x: f'bin{x}')
|
215 |
+
else:
|
216 |
+
df_woe_iv = (pd.crosstab(df[column_name], df[dep_var], normalize='columns').assign(woe=lambda dfx: np.log((dfx[1] + (0.5 / y1)) / (dfx[0] + (0.5 / y0)))).assign(iv=lambda dfx: (dfx['woe'] * (dfx[1] - dfx[0]))))
|
217 |
+
woe_map = df_woe_iv['woe'].to_dict()
|
218 |
+
woe_col = df[column_name].map(woe_map)
|
219 |
+
data = pd.qcut(woe_col, no_of_bins, duplicates='drop')
|
220 |
+
n = data.nunique()
|
221 |
+
labels = [f'bin{i}' for i in range(1, n + 1)]
|
222 |
+
data = data.cat.rename_categories(labels)
|
223 |
+
sizes = data.value_counts(normalize=True)
|
224 |
+
min_size = 0.05
|
225 |
+
while sizes.min() < min_size and no_of_bins > 1:
|
226 |
+
no_of_bins -= 1
|
227 |
+
data = pd.qcut(woe_col, q=no_of_bins, duplicates='drop')
|
228 |
+
if data.nunique() != data.cat.categories.nunique():
|
229 |
+
continue
|
230 |
+
n = data.nunique()
|
231 |
+
labels = [f'bin{i}' for i in range(1, n + 1)]
|
232 |
+
data = data.cat.rename_categories(labels)
|
233 |
+
sizes = data.value_counts(normalize=True)
|
234 |
+
return data
|
235 |
+
|
236 |
+
def naive_cat_bin(df, col, max_thre=10, min_thre=5, tolerence=2, flag='ignore'):
|
237 |
+
value_counts = df[col].value_counts()
|
238 |
+
total_values = len(df)
|
239 |
+
count_percentages = (value_counts / total_values) * 100
|
240 |
+
unique_values_df = pd.DataFrame({'Category': value_counts.index, 'Count Percentage': count_percentages})
|
241 |
+
count_per = list(unique_values_df['Count Percentage'])
|
242 |
+
|
243 |
+
final_ini = []
|
244 |
+
for i in count_per:
|
245 |
+
if i >= min_thre:
|
246 |
+
final_ini.append(i)
|
247 |
+
a = [x for x in count_per if x not in final_ini]
|
248 |
+
|
249 |
+
total_bins = int(100 / max_thre)
|
250 |
+
ava_bins = len(final_ini)
|
251 |
+
ava_bin_per = sum(final_ini)
|
252 |
+
bin_req = total_bins - ava_bins
|
253 |
+
bin_req_per = 100 - ava_bin_per
|
254 |
+
|
255 |
+
if flag == 'error' and bin_req > 0 and (bin_req_per / bin_req) > max_thre:
|
256 |
+
print(f"Binning for {col} is not possible with given parameters.")
|
257 |
+
return
|
258 |
+
|
259 |
+
step = False
|
260 |
+
while not step:
|
261 |
+
if bin_req > 0:
|
262 |
+
if (bin_req_per / bin_req) > min_thre:
|
263 |
+
step = True
|
264 |
+
else:
|
265 |
+
bin_req -= 1
|
266 |
+
else:
|
267 |
+
step = True
|
268 |
+
|
269 |
+
final_ini = [[x] for x in final_ini]
|
270 |
+
|
271 |
+
if bin_req > 0:
|
272 |
+
target_sum = bin_req_per / bin_req
|
273 |
+
else:
|
274 |
+
target_sum = bin_req_per
|
275 |
+
tolerence = 0
|
276 |
+
|
277 |
+
final = []
|
278 |
+
current_sum = 0.0
|
279 |
+
start_index = len(a) - 1
|
280 |
+
values = []
|
281 |
+
while start_index >= 0:
|
282 |
+
current_sum += a[start_index]
|
283 |
+
values.append(a[start_index])
|
284 |
+
if current_sum < target_sum - tolerence:
|
285 |
+
start_index -= 1
|
286 |
+
else:
|
287 |
+
final.append(values)
|
288 |
+
values = []
|
289 |
+
start_index -= 1
|
290 |
+
current_sum = 0.0
|
291 |
+
final.append(values)
|
292 |
+
final = final[::-1]
|
293 |
+
final = [sublist for sublist in final if sublist]
|
294 |
+
final_b = final_ini + final
|
295 |
+
|
296 |
+
final = [final_b[0]]
|
297 |
+
for subarr in final_b[1:]:
|
298 |
+
if sum(subarr) < (min_thre - tolerence):
|
299 |
+
final[-1].extend(subarr)
|
300 |
+
else:
|
301 |
+
final.append(subarr)
|
302 |
+
|
303 |
+
table = dict(zip(unique_values_df['Category'], unique_values_df['Count Percentage']))
|
304 |
+
new_final = [sublist.copy() for sublist in final]
|
305 |
+
|
306 |
+
table_reverse = defaultdict(list)
|
307 |
+
for k, v in table.items():
|
308 |
+
table_reverse[v].append(k)
|
309 |
+
|
310 |
+
output = []
|
311 |
+
for l in new_final:
|
312 |
+
temp = []
|
313 |
+
for item in l:
|
314 |
+
temp.append(table_reverse[item].pop())
|
315 |
+
output.append(temp)
|
316 |
+
new_final = output
|
317 |
+
|
318 |
+
k = len(new_final)
|
319 |
+
bin_labels = [f'bin{i}' for i in range(1, k + 1)]
|
320 |
+
bin_mapping = {value: bin_labels[i] for i, sublist in enumerate(new_final) for value in sublist}
|
321 |
+
bin_mapping[np.nan] = 'binNA'
|
322 |
+
return df[col].apply(lambda x: bin_mapping.get(x, x))
|
323 |
+
|
324 |
+
def col_bin_summary_categorical(df_cat, col, binned_df_1,dep_var=None):
|
325 |
+
unique_values_in_bins = df_cat.groupby(binned_df_1[col])[col].unique().apply(list)
|
326 |
+
unique_values_in_bins = unique_values_in_bins.rename_axis('bin').reset_index()
|
327 |
+
unique_bin_ranges = pd.Categorical(binned_df_1[col].unique())
|
328 |
+
uni = binned_df_1[col].nunique()
|
329 |
+
numeric_parts = [uni if val == 'binNA' else int(re.findall(r'\d+', val)[0]) for val in unique_bin_ranges]
|
330 |
+
unique_bin_ranges = unique_bin_ranges[np.argsort(numeric_parts)]
|
331 |
+
df_new_cat = pd.DataFrame({"column_name": [col] * len(unique_bin_ranges), "bin_ranges": unique_bin_ranges})
|
332 |
+
df_new_cat = df_new_cat.merge(unique_values_in_bins.rename(columns={'bin': 'bin_ranges', col: 'values in bin'}))
|
333 |
+
df_new_cat = df_new_cat.merge((binned_df_1[col].value_counts() / len(binned_df_1) * 100).reset_index().rename(columns={col: 'bin_ranges', 'count': 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
|
334 |
+
if dep_var is not None:
|
335 |
+
df_new_cat = df_new_cat.merge(binned_df_1.groupby(col)[dep_var].sum(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Event'}), on='bin_ranges')
|
336 |
+
df_new_cat = df_new_cat.merge(binned_df_1.groupby(col)[dep_var].mean(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'}), on='bin_ranges')
|
337 |
+
df_new_cat['Index'] = (100 * df_new_cat['Mean_DV'] / binned_df_1[dep_var].mean()).round()
|
338 |
+
return df_new_cat
|
339 |
+
|
340 |
+
def create_categorical_binned_data(imputed_df,col, categorical_binning, dep_var, no_of_bins=None, max_thre=None, min_thre=None,tolerence=2, flag='ignore'):
|
341 |
+
|
342 |
+
imputed_df[dep_var] = imputed_df[dep_var].astype('int64')
|
343 |
+
df_cat = imputed_df.select_dtypes(include=['object'])
|
344 |
+
# remove columns with only one unique values
|
345 |
+
unique_counts = df_cat.nunique()
|
346 |
+
unique_cols = unique_counts[unique_counts == 1].index.tolist()
|
347 |
+
df_cat = df_cat.drop(unique_cols, axis=1)
|
348 |
+
|
349 |
+
if categorical_binning == 'woe_iv':
|
350 |
+
df_nominal = pd.concat([imputed_df[col], imputed_df[dep_var]], axis=1)
|
351 |
+
tqdm.pandas(dynamic_ncols=True, position=0)
|
352 |
+
binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, dep_var, no_of_bins))
|
353 |
+
binned_df_nominal.drop(dep_var, axis=1, inplace=True)
|
354 |
+
binned_df_nominal = binned_df_nominal.applymap(lambda x: 'NA' if pd.isnull(x) else x)
|
355 |
+
binned_df_nominal = binned_df_nominal.astype('category')
|
356 |
+
|
357 |
+
cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
|
358 |
+
binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
|
359 |
+
|
360 |
+
binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dep_var]], axis=1)
|
361 |
+
elif categorical_binning == 'naive':
|
362 |
+
df_nominal = pd.concat([imputed_df[col], imputed_df[dep_var]], axis=1)
|
363 |
+
tqdm.pandas(dynamic_ncols=True, position=0)
|
364 |
+
binned_df_nominal = df_nominal.progress_apply(lambda x: naive_cat_bin(df_nominal, x.name, 20, 5, 2, flag='ignore'))
|
365 |
+
binned_df_nominal.drop(dep_var, axis=1, inplace=True)
|
366 |
+
binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')
|
367 |
+
binned_df_nominal = binned_df_nominal.astype('category')
|
368 |
+
|
369 |
+
cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
|
370 |
+
binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
|
371 |
+
|
372 |
+
binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dep_var]], axis=1)
|
373 |
+
|
374 |
+
df_summary=col_bin_summary_categorical(df_cat, col, binned_df_nominal_1,dep_var)
|
375 |
+
return df_summary
|
376 |
+
|
377 |
+
def create_categorical_binned_data1(imputed_df,col, nominal_binning, dependant_target_variable, no_of_bins=10, max_thre=10, min_thre=5, tolerence=2, flag='ignore', min_cluster_size=0.05, max_clusters=10):
|
378 |
+
|
379 |
+
imputed_df[dependant_target_variable] = imputed_df[dependant_target_variable].astype('int64')
|
380 |
+
df_cat = imputed_df.select_dtypes(include=['object'])
|
381 |
+
# remove columns with only one unique values
|
382 |
+
unique_counts = df_cat.nunique()
|
383 |
+
unique_cols = unique_counts[unique_counts == 1].index.tolist()
|
384 |
+
df_cat = df_cat.drop(unique_cols, axis=1)
|
385 |
+
|
386 |
+
if nominal_binning == 'woe':
|
387 |
+
df_nominal = pd.concat([imputed_df[col], imputed_df[dependant_target_variable]], axis=1)
|
388 |
+
tqdm.pandas(dynamic_ncols=True, position=0)
|
389 |
+
binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, dependant_target_variable, no_of_bins))
|
390 |
+
binned_df_nominal.drop(dependant_target_variable, axis=1, inplace=True)
|
391 |
+
binned_df_nominal = binned_df_nominal.applymap(lambda x: 'NA' if pd.isnull(x) else x)
|
392 |
+
binned_df_nominal = binned_df_nominal.astype('category')
|
393 |
+
|
394 |
+
cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
|
395 |
+
binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
|
396 |
+
|
397 |
+
binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dependant_target_variable]], axis=1)
|
398 |
+
elif nominal_binning == 'naive':
|
399 |
+
df_nominal = pd.concat([imputed_df[col], imputed_df[dependant_target_variable]], axis=1)
|
400 |
+
tqdm.pandas(dynamic_ncols=True, position=0)
|
401 |
+
binned_df_nominal = df_nominal.progress_apply(lambda x: naive_cat_bin(df_nominal, x.name, 20, 5, 2, flag='ignore'))
|
402 |
+
binned_df_nominal.drop(dependant_target_variable, axis=1, inplace=True)
|
403 |
+
binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')
|
404 |
+
binned_df_nominal = binned_df_nominal.astype('category')
|
405 |
+
|
406 |
+
cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
|
407 |
+
binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
|
408 |
+
|
409 |
+
binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dependant_target_variable]], axis=1)
|
410 |
+
|
411 |
+
df_summary=col_bin_summary_categorical(df_cat, col, binned_df_nominal_1,dependant_target_variable)
|
412 |
+
|
413 |
+
binned_data = pd.DataFrame()
|
414 |
+
for bin_value in df_summary['values in bin']:
|
415 |
+
bin_column_name = f"{col}_{bin_value}"
|
416 |
+
binned_data[bin_column_name] = np.where(df_cat[col].isin(bin_value), 1, 0)
|
417 |
+
|
418 |
+
return binned_data
|
419 |
+
|
420 |
+
|
421 |
+
|
422 |
+
numerical_columns = st.session_state.imputed_df.select_dtypes(include=['number']).columns.tolist()
|
423 |
+
numerical_columns = [x for x in numerical_columns if x != st.session_state.flag]
|
424 |
+
categorical_columns = st.session_state.imputed_df.select_dtypes(include=['object', 'category']).columns.tolist()
|
425 |
+
categorical_columns = [x for x in categorical_columns if x != st.session_state.identifier]
|
426 |
+
st.session_state.numerical_columns=numerical_columns
|
427 |
+
st.session_state.categorical_columns=categorical_columns
|
428 |
+
|
429 |
+
|
430 |
+
st.title("Variable Profiling")
|
431 |
+
|
432 |
+
# Retrieve stored options from session_state or use default values
|
433 |
+
function_num = st.session_state.get("function_num", "value")
|
434 |
+
depth = st.session_state.get("depth", 3)
|
435 |
+
num_bins = st.session_state.get("num_bins", 10)
|
436 |
+
function_cat = st.session_state.get("function_cat", "woe_iv")
|
437 |
+
max_slider = st.session_state.get("max_slider", 10)
|
438 |
+
min_slider = st.session_state.get("min_slider", 5)
|
439 |
+
cat_bins_iv = st.session_state.get("cat_bins_iv", 10)
|
440 |
+
cat_bins_naive = st.session_state.get("cat_bins_naive", 10)
|
441 |
+
|
442 |
+
with st.expander("Profiling Inputs"):
|
443 |
+
st.write("Binning Inputs")
|
444 |
+
ui_columns = st.columns((1, 1))
|
445 |
+
with ui_columns[0]:
|
446 |
+
function_num = st.selectbox(
|
447 |
+
label="Select Numerical Binning Function",
|
448 |
+
options=['value', 'tree'],
|
449 |
+
#index=None
|
450 |
+
index=['value', 'tree'].index(st.session_state.function_num) if 'function_num' in st.session_state and st.session_state.function_num is not None else None
|
451 |
+
)
|
452 |
+
st.session_state.function_num = function_num # Store selected option
|
453 |
+
params_num = st.empty()
|
454 |
+
|
455 |
+
with params_num:
|
456 |
+
with ui_columns[-1]:
|
457 |
+
if function_num == 'tree':
|
458 |
+
depth = st.slider(
|
459 |
+
label="Depth",
|
460 |
+
min_value=1,
|
461 |
+
max_value=10,
|
462 |
+
value=depth,
|
463 |
+
key='depth_slider')
|
464 |
+
st.session_state.depth = depth # Store selected depth
|
465 |
+
elif function_num == 'value':
|
466 |
+
num_bins = st.slider(
|
467 |
+
label="Number of Bins",
|
468 |
+
min_value=2,
|
469 |
+
max_value=20,
|
470 |
+
value=num_bins,
|
471 |
+
key='num_bins_slider_num')
|
472 |
+
st.session_state.num_bins = num_bins # Store selected number of bins
|
473 |
+
left, right = st.columns(2)
|
474 |
+
|
475 |
+
with left:
|
476 |
+
function_cat = st.selectbox(
|
477 |
+
label="Select Categorical Binning Function",
|
478 |
+
options=['woe_iv', 'naive'],
|
479 |
+
#index=None
|
480 |
+
index=['woe_iv', 'naive'].index(st.session_state.function_cat) if 'function_cat' in st.session_state and st.session_state.function_cat is not None else None
|
481 |
+
)
|
482 |
+
st.session_state.function_cat = function_cat # Store selected option
|
483 |
+
params_cat = st.empty()
|
484 |
+
|
485 |
+
with params_cat:
|
486 |
+
|
487 |
+
if function_cat == 'woe_iv':
|
488 |
+
with right:
|
489 |
+
cat_bins_iv = st.slider(
|
490 |
+
label="Number of Bins",
|
491 |
+
min_value=2,
|
492 |
+
max_value=20,
|
493 |
+
value=cat_bins_iv,
|
494 |
+
key='num_bins_slider_cat_iv')
|
495 |
+
st.session_state.cat_bins_iv = cat_bins_iv # Store selected number of bins
|
496 |
+
with left:
|
497 |
+
min_slider = st.slider(
|
498 |
+
label="Min Threshold",
|
499 |
+
min_value=1,
|
500 |
+
max_value=100,
|
501 |
+
value=min_slider,
|
502 |
+
key='min_slider')
|
503 |
+
st.session_state.min_slider = min_slider # Store selected min threshold
|
504 |
+
with right:
|
505 |
+
max_slider = st.slider(
|
506 |
+
label="Max Threshold",
|
507 |
+
min_value=1,
|
508 |
+
max_value=100,
|
509 |
+
value=max_slider,
|
510 |
+
key='max_slider')
|
511 |
+
st.session_state.max_slider = max_slider # Store selected max threshold
|
512 |
+
elif function_cat == 'naive':
|
513 |
+
with right:
|
514 |
+
cat_bins_naive = st.slider(
|
515 |
+
label="Number of Bins",
|
516 |
+
min_value=2,
|
517 |
+
max_value=20,
|
518 |
+
value=cat_bins_naive,
|
519 |
+
key='num_bins_slider_cat_naive')
|
520 |
+
st.session_state.cat_bins_naive = cat_bins_naive # Store selected number of bins
|
521 |
+
|
522 |
+
with left:
|
523 |
+
st.write("#")
|
524 |
+
perform_profiling = st.button(
|
525 |
+
label="Perform profiling"
|
526 |
+
)
|
527 |
+
|
528 |
+
|
529 |
+
# if perform_profiling:
|
530 |
+
# binned_data_num = pd.DataFrame()
|
531 |
+
# for col in st.session_state.numerical_columns:
|
532 |
+
# if function_num == 'tree':
|
533 |
+
# depth = depth
|
534 |
+
# else:
|
535 |
+
# depth=None
|
536 |
+
# if function_num == 'value':
|
537 |
+
# num_bins=num_bins
|
538 |
+
# else:
|
539 |
+
# num_bins=None
|
540 |
+
# binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num,num_bins,st.session_state.flag, depth)
|
541 |
+
# binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
|
542 |
+
# binned_data_num = pd.concat([binned_data_num, binned_data_col],axis=0)
|
543 |
+
# st.markdown("binned_data_num")
|
544 |
+
# st.dataframe(binned_data_num,use_container_width=True,hide_index=True)
|
545 |
+
|
546 |
+

if perform_profiling:
    with st.expander("Profiling summary"):
        st.write("Numerical binned data")
        binned_data_num = pd.DataFrame()
        for col in st.session_state.numerical_columns:
            if function_num == 'tree':
                depth = depth
            else:
                depth = None
            if function_num == 'value':
                num_bins = num_bins
            else:
                num_bins = None
            binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
            binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
            binned_data_num = pd.concat([binned_data_num, binned_data_col], axis=0)
        st.dataframe(binned_data_num, use_container_width=True, hide_index=True)

        st.write("Categorical binned data")
        binned_data_cat = pd.DataFrame()
        for col in st.session_state.categorical_columns:
            if function_cat == 'woe_iv':
                max_thre = max_slider
                min_thre = min_slider
                no_of_bins = cat_bins_iv
            else:
                max_thre = None
                min_thre = None
                no_of_bins = None
            if function_cat == 'naive':
                no_of_bins = cat_bins_naive
            else:
                no_of_bins = None
            binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
            binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
            binned_data_col_cat.drop('column_name', axis=1, inplace=True)
            binned_data_cat = pd.concat([binned_data_cat, binned_data_col_cat], axis=0)
        st.dataframe(binned_data_cat, use_container_width=True, hide_index=True)

    with st.expander("Profiling summary: Plots"):
        st.markdown(
            "<p class='plot-header'>Change the selected variable to plot"
            " different charts</p>",
            unsafe_allow_html=True,
        )
        left, right = st.columns(2)
        with left:
            if 'selected_variable' not in st.session_state:
                st.session_state.selected_variable = []  # Initialize selected_variable

            selected_variable = st.selectbox(
                "Variable",
                st.session_state.numerical_columns + st.session_state.categorical_columns,
                # index=None
            )
            if isinstance(selected_variable, str):
                selected_variable = [selected_variable]  # Convert single selection to list

            # Update session state with selected variable
            st.session_state.selected_variable = selected_variable

        # Iterate over selected variable(s)
        if st.session_state.selected_variable:
            for col in st.session_state.selected_variable:
                if col in st.session_state.numerical_columns:
                    if function_num == 'tree':
                        depth = depth
                    else:
                        depth = None
                    if function_num == 'value':
                        num_bins = num_bins
                    else:
                        num_bins = None
                    binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
                    binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
                    fig = plot_chart(binned_data_col, col, dep_var=None)
                    st.plotly_chart(fig, use_container_width=True)

                elif col in st.session_state.categorical_columns:
                    if function_cat == 'woe_iv':
                        max_thre = max_slider
                        min_thre = min_slider
                        no_of_bins = cat_bins_iv
                    else:
                        max_thre = None
                        min_thre = None
                        no_of_bins = None
                    if function_cat == 'naive':
                        no_of_bins = cat_bins_naive
                    else:
                        no_of_bins = None
                    binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
                    binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
                    binned_data_col_cat.drop('column_name', axis=1, inplace=True)
                    fig_cat = plot_chart(binned_data_col_cat, col, dep_var=None)
                    st.plotly_chart(fig_cat, use_container_width=True)

    st.divider()
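
    # Illustrative sketch only: `create_numerical_binned_data` and
    # `create_categorical_binned_data` are defined earlier in this page and are
    # not reproduced here. For the 'value' option, an equal-frequency binning
    # helper of roughly this shape could be written as the (unused) function
    # below; the column names 'bin_ranges'/'total'/'flagged' are assumptions
    # made for illustration, not the actual return schema.
    def _example_value_binning(df, col, num_bins, flag):
        # Equal-frequency bins via pandas qcut; duplicate edges are dropped.
        bins = pd.qcut(df[col], q=num_bins, duplicates='drop')
        out = df.groupby(bins)[flag].agg(total='count', flagged='sum').reset_index()
        out = out.rename(columns={col: 'bin_ranges'})
        return out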

    # Combine numerical and categorical binned data into one dataframe
    binned_data_combined = pd.DataFrame()

    # Process numerical columns
    for col in st.session_state.numerical_columns:
        if function_num == 'tree':
            depth = depth
        else:
            depth = None
        if function_num == 'value':
            num_bins = num_bins
        else:
            num_bins = None
        # Create numerical binned data
        binned_data_num = create_numerical_binned_data1(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
        binned_data_combined = pd.concat([binned_data_combined, binned_data_num], axis=1)

    # Process categorical columns
    for col in st.session_state.categorical_columns:
        if function_cat == 'woe_iv':
            max_thre = max_slider
            min_thre = min_slider
            no_of_bins = cat_bins_iv
        else:
            max_thre = None
            min_thre = None
            no_of_bins = None
        if function_cat == 'naive':
            no_of_bins = cat_bins_naive
        else:
            no_of_bins = None
        # Create categorical binned data
        binned_data_cat = create_categorical_binned_data1(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
        binned_data_combined = pd.concat([binned_data_combined, binned_data_cat], axis=1)

    def clean_column_name(column_name):
        # Strip the ".<digits>" decimal parts from bin-boundary labels
        return re.sub(r'\.(\d+)', '', column_name)

    binned_data_combined.columns = binned_data_combined.columns.map(clean_column_name)
    valid_feature_names = [name.replace('[', '').replace(']', '').replace('<', '').replace(',', '_').replace('(', '').replace("'", '') for name in binned_data_combined.columns]
    valid_feature_names = [name.replace(' ', '') for name in valid_feature_names]
    binned_data_combined.columns = valid_feature_names

    # Display the combined binned data dataframe
    st.session_state.binned_df = binned_data_combined
    st.session_state.binned_df[st.session_state.flag] = st.session_state.imputed_df[st.session_state.flag]
    st.session_state.binned_df.insert(0, st.session_state.identifier, st.session_state.imputed_df[st.session_state.identifier])
    print(st.session_state.binned_df['individual_id_ov'])
    # st.session_state.binned_df[st.session_state.identifier] = st.session_state.imputed_df[st.session_state.identifier]
    st.markdown("Binned DataFrame")
    st.dataframe(binned_data_combined.head(10), use_container_width=True, hide_index=True)

    # Add a button to download the binned dataframe
    if st.session_state.binned_df is not None:
        download_button = st.download_button(
            label="Download Binned Data as CSV",
            data=st.session_state.binned_df.to_csv(index=False).encode(),
            file_name='binned_data.csv',
            mime='text/csv',
        )
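
    # Illustration of the column-name sanitation above (the bin label format is
    # assumed for this example; actual labels depend on the binning helpers):
    # a column such as "age_(20.0, 35.5]" first loses the ".<digits>" parts,
    # giving "age_(20, 35]", and then the brackets, comma and spaces, ending up
    # as "age_20_35" -- a form that downstream models such as XGBoost accept as
    # a feature name.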
pages/pages/3_Point estimates.py
ADDED
@@ -0,0 +1,369 @@
###### SUPER SAFE ######

import numpy as np
import pandas as pd
import streamlit as st
import seaborn as sn
import matplotlib.pyplot as plt
import xgboost as xgb
import plotly.figure_factory as ff
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA


st.set_page_config(
    layout="wide",
)

def point_estimates(df, model_type, flag, identifier, control_sample_size, solver=None, max_iter=None, class_weights=None, max_depth=None, subsample=None, eta=None):
    # if set(df[df[flag] == 0][identifier]).intersection(set(df[df[flag] == 1][identifier])):
    #     st.error("The identifier should not be common between flag values 0 and 1.")

    Xs = df.drop(columns=[identifier, flag], axis=1)
    X_scaled = StandardScaler().fit_transform(Xs)
    n_comp = len(Xs.columns)
    pca = PCA(n_components=n_comp)
    pca.fit(X_scaled)
    princ_comp = pca.transform(X_scaled)
    PCA_DF = pd.DataFrame(princ_comp)
    pca_var = pca.explained_variance_ratio_[0:n_comp].cumsum()
    idx = [i for i in range(len(pca_var)) if pca_var[i] > 0.995][0]
    df_pca = PCA_DF.loc[:, 0:idx]
    df_pca[flag] = df[flag]
    print(df_pca)

    # creating train and control datasets
    df_train = df_pca[df_pca[flag] == 1]
    df_control = df_pca[df_pca[flag] == 0]
    df_control_sample = df_control.sample(n=control_sample_size, random_state=42)
    final_df_sample = pd.concat([df_train, df_control_sample], ignore_index=True)
    non_req_cols = [flag]
    req_cols = df_pca.columns[~df_pca.columns.isin(non_req_cols)]

    # create a holdout set
    identifier_df, X, y = df[[identifier]], final_df_sample[req_cols], final_df_sample[[flag]]

    if model_type == 'linear':
        # scale features
        # min_max_scaler = MinMaxScaler()
        # X_norm = min_max_scaler.fit_transform(X)
        # X_norm = (X - X.min()) / (X.max() - X.min())
        # fit model
        model = LogisticRegression(solver=solver, max_iter=max_iter, class_weight=class_weights)
        model.fit(X, y)
        # feature importances
        coefs = model.coef_[0]
        feats = X.columns
        importance_df = pd.DataFrame({'features': feats, 'coefficients': coefs})
        importance_df['abs_coef'] = np.abs(importance_df['coefficients'])
    elif model_type == 'xgboost':
        model = xgb.XGBClassifier(max_depth=max_depth, subsample=subsample, eta=eta)
        model.fit(X, y)
        importance = model.feature_importances_
        feats = X.columns
        importance_df = pd.DataFrame({'features': feats, 'Importance': importance})

    # Prediction
    Y_pred = model.predict(X)
    # Confusion matrix
    # cm = confusion_matrix(y, Y_pred) / y.shape[0]
    cm = confusion_matrix(y, Y_pred) / len(y)

    # Create DataFrame for confusion matrix
    classes = np.unique(y)
    df_cm = pd.DataFrame(cm, index=classes, columns=classes)

    # Create hover text
    hover_text = [['Actual: {}<br>Predicted: {}<br>Value: {:.2f}'.format(y.iloc[i, 0], Y_pred[i], cm[i, j])
                   for j in range(len(classes))] for i in range(len(classes))]

    # Create heatmap using Plotly with hover text
    fig = ff.create_annotated_heatmap(z=df_cm.values,
                                      x=list(classes),
                                      y=list(classes),
                                      colorscale='blues',
                                      hoverinfo='text',
                                      text=hover_text)

    # Update heatmap layout
    fig.update_layout(
        title='Confusion Matrix',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        font=dict(size=14)
    )

    # Display Plotly figure in Streamlit
    # st.plotly_chart(fig)
    # classification report
    report = classification_report(y, Y_pred, output_dict=True)
    # Convert the classification report to a DataFrame
    report_df = pd.DataFrame(report).transpose()

    # prep data
    X, y = df_pca[req_cols], df_pca[[flag]]
    # X, y = df.drop(columns=[flag, identifier]), df[[flag]]
    # scale features
    # min_max_scaler = MinMaxScaler()
    # X_norm = min_max_scaler.fit_transform(X)
    # X_norm = (X - X.min()) / (X.max() - X.min())
    # run inference
    y_pred_proba = model.predict_proba(X)
    y_pred_df = pd.DataFrame(y_pred_proba)
    df_pca.insert(0, 'propensity_score', y_pred_df[1])
    # df_pca[identifier] = identifier_df
    # df_pca[identifier] = df_pca[identifier].astype('str')

    # Display classification report
    st.subheader("Classification Report")
    st.dataframe(report_df, width=600)

    # Display confusion matrix
    # st.subheader("Confusion Matrix")
    # st.write(df_cm, width=600)

    # Display confusion matrix
    st.subheader("Confusion matrix")
    st.plotly_chart(fig)
    return df_pca[['propensity_score']]
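
# Usage (mirrors the call made further down this page): the binned dataframe is
# expected to carry the identifier and flag columns plus the binned features, e.g.
#   scores = point_estimates(st.session_state.binned_df, model_type='linear',
#                            flag='Y', identifier='individual_id_ov',
#                            control_sample_size=5000, solver='lbfgs',
#                            max_iter=1000, class_weights=None)
# The returned single-column frame holds P(flag == 1) for every row, which the
# later pages treat as the propensity score. The concrete values above
# (5000 controls, lbfgs) are illustrative defaults only, not fixed choices.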
st.title("Algorithms")
|
183 |
+
|
184 |
+
#st.subheader("Classification") # Added line
|
185 |
+
#classification_option = st.radio("Classification", ["Classification"]) # Added line
|
186 |
+
|
187 |
+
if 'classification_option' not in st.session_state:
|
188 |
+
st.session_state.classification_option = "Classification"
|
189 |
+
if 'algorithm_option' not in st.session_state:
|
190 |
+
st.session_state.algorithm_option = "Logistic Regression"
|
191 |
+
|
192 |
+
classification_option = st.radio("Algorithm Type", ["Classification", "Regression"], key="classification_option")
|
193 |
+
|
194 |
+
if classification_option != st.session_state.classification_option:
|
195 |
+
st.session_state.classification_option = classification_option
|
196 |
+
|
197 |
+
if st.session_state.classification_option == "Classification":
|
198 |
+
col1, col2 = st.columns(2)
|
199 |
+
|
200 |
+
with col1:
|
201 |
+
st.write("#####")
|
202 |
+
lr_checkbox = st.checkbox(
|
203 |
+
label="Logistic Regression",
|
204 |
+
key="algorithm_lr_cb",
|
205 |
+
value=(st.session_state.algorithm_option == "Logistic Regression")
|
206 |
+
)
|
207 |
+
|
208 |
+
with col2:
|
209 |
+
st.write("#####")
|
210 |
+
show_lr_options = st.checkbox(
|
211 |
+
label="Change default options",
|
212 |
+
key="lr_options_cb",
|
213 |
+
disabled=not lr_checkbox,
|
214 |
+
)
|
215 |
+
|
216 |
+
cols = st.columns((2, 1))
|
217 |
+
with cols[0]:
|
218 |
+
lr_hyp_placeholder = st.empty()
|
219 |
+
lr_model_placeholder = st.empty()
|
220 |
+
|
221 |
+
solver='lbfgs'
|
222 |
+
class_weights=None
|
223 |
+
max_iter=1000
|
224 |
+
if show_lr_options and lr_checkbox:
|
225 |
+
with lr_hyp_placeholder:
|
226 |
+
with st.expander("LR parameters"):
|
227 |
+
solver=st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
|
228 |
+
max_iter=st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
|
229 |
+
class_weight_option = st.selectbox(
|
230 |
+
'Select class weights option:',
|
231 |
+
('Custom', 'Balanced')
|
232 |
+
)
|
233 |
+
|
234 |
+
if class_weight_option == 'Custom':
|
235 |
+
weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
|
236 |
+
weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
|
237 |
+
class_weights = {1: weight_1, 0: weight_0}
|
238 |
+
elif class_weight_option == 'Balanced':
|
239 |
+
class_weights = {1: 0.5, 0: 0.5}
|
240 |
+
#control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))
|
241 |
+
|
242 |
+
col1, col2 = st.columns(2)
|
243 |
+
|
244 |
+
with col1:
|
245 |
+
st.write("#####")
|
246 |
+
xgb_checkbox = st.checkbox(
|
247 |
+
label="Xgboost Classifier", key="algorithm_xgb_cb",
|
248 |
+
value=(st.session_state.algorithm_option == "Xgboost Classifier")
|
249 |
+
)
|
250 |
+
|
251 |
+
with col2:
|
252 |
+
st.write("#####")
|
253 |
+
show_xgb_options = st.checkbox(
|
254 |
+
label="Change default options",
|
255 |
+
key="xgb_options_cb",
|
256 |
+
disabled=not xgb_checkbox,
|
257 |
+
)
|
258 |
+
|
259 |
+
cols = st.columns((2, 1))
|
260 |
+
with cols[0]:
|
261 |
+
xgb_hyp_placeholder = st.empty()
|
262 |
+
|
263 |
+
max_depth=None
|
264 |
+
subsample=None
|
265 |
+
eta=None
|
266 |
+
|
267 |
+
if show_xgb_options and xgb_checkbox:
|
268 |
+
with xgb_hyp_placeholder:
|
269 |
+
with st.expander("XGB hyper parameters"):
|
270 |
+
max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
|
271 |
+
subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
|
272 |
+
eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)
|
273 |
+
#control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))
|
274 |
+
st.session_state.algorithm_option = "Logistic Regression" if lr_checkbox else "Xgboost Classifier"
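
# Note: the Regression branch below mirrors this UI, but point_estimates() as
# defined above only fits classifiers (LogisticRegression / XGBClassifier) to
# produce propensity scores; the regression widgets collect parameters that the
# current "Run Modeling" call does not use for a separate regression model.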

elif classification_option == "Regression":
    col1, col2 = st.columns(2)

    with col1:
        st.write("#####")
        lr_checkbox = st.checkbox(
            label="Linear Regression",
            key="algorithm_lr_cb",
            value=(st.session_state.algorithm_option == "Linear Regression")
        )

    with col2:
        st.write("#####")
        show_lr_options = st.checkbox(
            label="Change default options",
            key="lr_options_cb",
            disabled=not lr_checkbox,
        )

    cols = st.columns((2, 1))
    with cols[0]:
        lr_hyp_placeholder = st.empty()
        lr_model_placeholder = st.empty()

    solver = 'lbfgs'
    class_weights = None
    max_iter = 1000
    if show_lr_options and lr_checkbox:
        with lr_hyp_placeholder:
            with st.expander("LR parameters"):
                solver = st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
                max_iter = st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
                class_weight_option = st.selectbox(
                    'Select class weights option:',
                    ('Custom', 'Balanced')
                )

                if class_weight_option == 'Custom':
                    weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
                    weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
                    class_weights = {1: weight_1, 0: weight_0}
                elif class_weight_option == 'Balanced':
                    class_weights = {1: 0.5, 0: 0.5}

    col1, col2 = st.columns(2)

    with col1:
        st.write("#####")
        xgb_checkbox = st.checkbox(
            label="Xgboost Regression", key="algorithm_xgb_cb",
            value=(st.session_state.algorithm_option == "Xgboost Regression")
        )

    with col2:
        st.write("#####")
        show_xgb_options = st.checkbox(
            label="Change default options",
            key="xgb_options_cb",
            disabled=not xgb_checkbox,
        )

    cols = st.columns((2, 1))
    with cols[0]:
        xgb_hyp_placeholder = st.empty()

    max_depth = None
    subsample = None
    eta = None

    if show_xgb_options and xgb_checkbox:
        with xgb_hyp_placeholder:
            with st.expander("XGB hyper parameters"):
                max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
                subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
                eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)
    st.session_state.algorithm_option = "Linear Regression" if lr_checkbox else "Xgboost Regression"

with cols[0]:
    control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))

# st.subheader("Classification")  # Added line
# classification_option = st.radio("Classification", ["Classification"])  # Added line

if st.button("Run Modeling"):
    if lr_checkbox:
        st.session_state.binned_df['propensity_score'] = point_estimates(st.session_state.binned_df, model_type='linear', flag=st.session_state.flag, identifier=st.session_state.identifier, control_sample_size=control_sample_size, solver=solver, max_iter=max_iter, class_weights=class_weights)
    elif xgb_checkbox:
        st.session_state.binned_df['propensity_score'] = point_estimates(st.session_state.binned_df, model_type='xgboost', flag=st.session_state.flag, identifier=st.session_state.identifier, control_sample_size=control_sample_size, max_depth=max_depth, subsample=subsample, eta=eta)

    # st.session_state.binned_df['propensity_score'] = result_df['propensity_score']
    st.session_state.treated_df = st.session_state.binned_df[st.session_state.binned_df['Y'] == 1]
    st.session_state.non_treated_df = st.session_state.binned_df[st.session_state.binned_df['Y'] == 0]
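
# The hard-coded 'Y' above assumes the treatment flag column is literally named
# "Y" (as in the sample upload schema); if the flag chosen on the earlier pages
# differs, st.session_state.flag would be the safer reference here.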
pages/pages/4_Matching & Diagnostics.py
ADDED
@@ -0,0 +1,490 @@
import base64
from math import sqrt
from statistics import mean, variance

import numpy as np
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import plotly.graph_objects as go
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler


def cohend_plot_function(std_mean_diff_df2, std_mean_diff_df, selected_attributes):
    # Create subplot of selected attributes
    fig = go.Figure()

    x = std_mean_diff_df2[std_mean_diff_df2["Metrics"].isin(selected_attributes)]["Cohend Value"][::-1]
    y = list(std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Metrics"][::-1])

    x1 = std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Cohend Value"][::-1]
    y1 = list(std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Metrics"][::-1])

    # Add traces
    fig.add_trace(go.Scatter(x=x, y=y, mode='markers', marker=dict(color='blue'), name='general_control_cohend'))
    fig.add_trace(go.Scatter(x=x1, y=y1, mode='markers', marker=dict(color='orange', symbol='diamond-open'), name='synthetic_control_cohend'))

    # Add vertical lines
    for val in [-0.1, 0.1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75]:
        fig.add_shape(type="line", x0=val, y0=0, x1=val, y1=10, line=dict(color="gray", width=1, dash="dash"))

    # Add vertical line at x=0
    fig.add_shape(type="line", x0=0, y0=0, x1=0, y1=10, line=dict(color="black", width=1))

    # Update layout
    fig.update_layout(
        xaxis=dict(title='cohend', range=[-1, 1]),
        yaxis=dict(title='Metrics', autorange="reversed"),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
    )

    # Show
    st.plotly_chart(fig, use_container_width=True)


def plot_comparison(comparison_df):
    fig = go.Figure()

    # Add bars for treatment and control values
    fig.add_trace(go.Bar(x=comparison_df.index, y=comparison_df[comparison_df.columns[0]], name='Treatment', marker=dict(color='#053057')))
    fig.add_trace(go.Bar(x=comparison_df.index, y=comparison_df[comparison_df.columns[1]], name='Control', marker=dict(color='#8ac4f8')))

    # Update layout
    fig.update_layout(
        xaxis=dict(title='quartiles'),
        yaxis=dict(title='values'),
        barmode='group',
        title=comparison_df.columns[0].split('treatment')[1][1:]
    )

    # Show
    st.plotly_chart(fig, use_container_width=True)


def plot_propensity_distribution(treatment_data, control_data):
    fig = go.Figure()

    # Add histograms for treatment and control data
    fig.add_trace(go.Histogram(x=treatment_data, name='Treatment', marker=dict(color='#053057'), opacity=0.6))
    fig.add_trace(go.Histogram(x=control_data, name='Control', marker=dict(color='#8ac4f8'), opacity=0.6))

    # Update layout
    fig.update_layout(
        xaxis=dict(title='propensity_score'),
        yaxis=dict(title='count'),
        barmode='overlay',
        title='Propensity Distribution'
    )

    # Show
    st.plotly_chart(fig, use_container_width=True)


def comparison(df, variable):
    # generates a comparison df for any given feature
    treatment_values = df[df.Y == 1].groupby('quartiles')[variable].mean()
    control_values = df[df.Y == 0].groupby('quartiles')[variable].mean()
    comparison = pd.merge(treatment_values, control_values, left_index=True, right_index=True)
    comparison.rename({f'{variable}_x': f'treatment_{variable}', f'{variable}_y': f'control_{variable}'}, axis=1, inplace=True)
    comparison['difference'] = np.abs(comparison[f'treatment_{variable}'] - comparison[f'control_{variable}'])
    comparison['percent_difference'] = np.abs((comparison[f'treatment_{variable}'] - comparison[f'control_{variable}']) / comparison[f'treatment_{variable}'])
    return comparison
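
# Example of the frame `comparison()` returns (numbers are made up purely for
# illustration): one row per propensity-score quartile with the treatment mean,
# the matched-control mean, and their absolute / relative gap, e.g.
#   quartiles  treatment_age  control_age  difference  percent_difference
#   0          34.1           33.8         0.3         0.0088
# The merge of the two grouped Series produces `<variable>_x` / `<variable>_y`
# columns, which are renamed to treatment_/control_ above.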


# Function to calculate Cohen's d for independent samples

def cohend(d1, d2):
    n1, n2 = len(d1), len(d2)
    s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1)
    s = sqrt(((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2))
    u1, u2 = mean(d1), mean(d2)
    # Check if the standard deviation is zero
    if s == 0:
        return 0  # Return 0 when the denominator is zero
    else:
        return (u1 - u2) / s
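
# Cohen's d as computed above: d = (mean(d1) - mean(d2)) / s_pooled, with
# s_pooled = sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)).
# Quick illustrative check: d1 with mean 10, d2 with mean 9 and a pooled
# standard deviation of 2 gives d = 0.5, i.e. a medium-sized imbalance; the
# diagnostics below treat |d| <= 0.1 as acceptably balanced.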

# Function to calculate standardized mean differences
def std_mean_diff(group_A_df, group_B_df):
    cohend_values_arr = [0] * len(group_A_df.columns)

    for i in range(len(group_A_df.columns)):
        cohend_values_arr[i] = cohend(group_A_df[group_A_df.columns[i]], group_B_df[group_A_df.columns[i]])

    cohend_array_pre_transp = [group_A_df.columns, cohend_values_arr]
    np_array = np.array(cohend_array_pre_transp)
    cohend_array = np.transpose(np_array)

    return cohend_array

# Function to get matched IDs and calculate Cohen's d values
def cohend_code_function(binned_df, matching_df):
    treat_df_complete = binned_df[binned_df['Y'] == 1]
    control_df_complete = binned_df[binned_df['Y'] == 0]
    treat_df_complete.drop('Y', axis=1, inplace=True)
    control_df_complete.drop('Y', axis=1, inplace=True)
    treatment_cust = pd.DataFrame()
    control_cust = pd.DataFrame()
    treatment_cust['individual_id_ov'] = matching_df["Id"]
    control_cust['individual_id_ov'] = matching_df["matched_Id"]

    # getting cohend values for synthetic control population

    group_A_df = treatment_cust[['individual_id_ov']]
    group_A_df = group_A_df.merge(treat_df_complete, how='left', right_on='individual_id_ov', left_on='individual_id_ov')
    group_B_df = control_cust[['individual_id_ov']]
    group_B_df = group_B_df.merge(control_df_complete, how='left', right_on='individual_id_ov', left_on='individual_id_ov')

    group_A_df.drop('individual_id_ov', axis=1, inplace=True)
    group_B_df.drop('individual_id_ov', axis=1, inplace=True)

    cohensd_df = std_mean_diff(group_A_df, group_B_df)
    std_mean_diff_df = pd.DataFrame(columns=["Metrics", "Cohend Value"])
    for i in range(len(cohensd_df)):
        std_mean_diff_df.loc[len(std_mean_diff_df.index)] = [cohensd_df[i][0], round(float(cohensd_df[i][1]), 2)]

    std_mean_diff_df["flag"] = std_mean_diff_df.apply(lambda x: 1 if (x["Cohend Value"] > 0.1 or x["Cohend Value"] < -0.1) else 0, axis=1)
    st.write('Number of variables whose standardized mean difference between treatment and control is out of the desired range (-0.1, 0.1): ', std_mean_diff_df["flag"].sum())

    # Display the cohend output table
    st.write(std_mean_diff_df)

    # getting cohend values for General population

    group_A_df = treatment_cust[['individual_id_ov']]
    group_A_df = group_A_df.merge(treat_df_complete, how='left', right_on='individual_id_ov', left_on='individual_id_ov')
    group_B_df = control_df_complete[['individual_id_ov']]
    group_B_df = group_B_df.merge(control_df_complete, how='left', right_on='individual_id_ov', left_on='individual_id_ov')

    group_A_df.drop('individual_id_ov', axis=1, inplace=True)
    group_B_df.drop('individual_id_ov', axis=1, inplace=True)

    cohensd_df = std_mean_diff(group_A_df, group_B_df)

    std_mean_diff_df2 = pd.DataFrame(columns=["Metrics", "Cohend Value"])

    for i in range(len(cohensd_df)):
        std_mean_diff_df2.loc[len(std_mean_diff_df2.index)] = [cohensd_df[i][0], round(float(cohensd_df[i][1]), 2)]

    return std_mean_diff_df2, std_mean_diff_df

def calculate_iv(df, flag, identifier):
    df1 = df.drop([flag, identifier, 'propensity_score'], axis=1)
    iv_df = pd.DataFrame(columns=['Feature', 'IV'])
    for column in df1.columns:
        data = pd.concat([pd.qcut(df1[column], q=10, duplicates='drop'), df[flag]], axis=1)
        groups = data.groupby(by=column)[df[flag].name].agg(['count', 'sum'])
        groups['event_rate'] = groups['sum'] / groups['count']
        groups['non_event_rate'] = (groups['count'] - groups['sum']) / groups['count']
        groups['WOE'] = np.log(groups['event_rate'] / groups['non_event_rate'])
        groups['IV'] = (groups['event_rate'] - groups['non_event_rate']) * groups['WOE']
        iv = groups['IV'].sum()
        iv_df = pd.concat([iv_df, pd.DataFrame({'Feature': [column], 'IV': [iv]})], axis=0, ignore_index=True)
    return iv_df
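
# Weight of Evidence / Information Value as used above: for each decile bin,
# WOE = ln(event_rate / non_event_rate) and the bin's IV contribution is
# (event_rate - non_event_rate) * WOE; a feature's IV is the sum over its bins.
# Note this variant normalises by the bin's own count rather than by the
# column-wise event and non-event totals, and a bin with no events (or no
# non-events) yields log(0)/log(inf), so the IV values here are best read as a
# relative ranking signal alongside the XGBoost importances computed below.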

def xgboost_feature_importance(df, flag, identifier):
    X, y = df.drop([flag, identifier, 'propensity_score'], axis=1), df[[flag]]
    model = xgb.XGBClassifier()
    model.fit(X, y)
    importances = model.feature_importances_
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    return importance_df

# iv_result = calculate_iv(df_features, df_target)
# importance_result = xgboost_feature_importance(df_features, df_target)


def get_matching_pairs(identifier, treated_df, non_treated_df, sample_size_A, sample_size_B, matching_columns, flag):
    # if treated_df[identifier].isna().any() or non_treated_df[identifier].isna().any():
    #     st.error("The identifier should not contain Nan's")

    treated_df = treated_df[matching_columns].sample(frac=sample_size_A / 100)
    non_treated_df = non_treated_df[matching_columns].sample(frac=sample_size_B / 100)

    treated_df = treated_df.set_index(st.session_state.identifier)
    treated_df.drop(flag, axis=1, inplace=True)

    non_treated_df = non_treated_df.set_index(st.session_state.identifier)
    non_treated_df.drop(flag, axis=1, inplace=True)

    treated_x = treated_df.values
    non_treated_x = non_treated_df.values

    scaler = StandardScaler()
    scaler.fit(treated_x)
    treated_x = scaler.transform(treated_x)
    non_treated_x = scaler.transform(non_treated_x)

    print("data transformation completed")

    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(non_treated_x)

    print("model fitting completed")

    distances, indices = nbrs.kneighbors(treated_x)

    print("matching completed")

    indices = indices.reshape([1, indices.shape[0] * indices.shape[1]])

    res = []
    for i in list(treated_df.index):
        for ele in range(1):
            res.append(i)

    output_df = pd.DataFrame()
    output_df["Id"] = res
    output_df["matched_Id"] = non_treated_df.iloc[indices[0]].index

    return output_df
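
# What get_matching_pairs does, in short: both groups are standardised with a
# scaler fit on the treated rows only, a 1-nearest-neighbour (ball-tree) model
# is fit on the control rows, and every treated row is paired with its closest
# control. Matching is effectively "with replacement" -- the same control id can
# be matched to several treated ids -- which is why the diagnostics below
# compare the matched (synthetic) control against the general control pool.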

# Streamlit App
st.title("Matching")

# Calculate IV
iv_df = calculate_iv(st.session_state.binned_df, st.session_state.flag, st.session_state.identifier)

# Calculate XGBoost feature importance
importance_df = xgboost_feature_importance(st.session_state.binned_df, st.session_state.flag, st.session_state.identifier)

# Combine IV and feature importance into a final DataFrame
combined_df = pd.merge(iv_df, importance_df, on='Feature', suffixes=('_iv', '_importance'))
combined_df['Avg_IV_Importance'] = (combined_df['IV'] + combined_df['Importance']) / 2
combined_df.sort_values('Avg_IV_Importance', inplace=True, ascending=False)
# Add the 'Select' column with checkboxes
combined_df.insert(0, 'Select', False)
combined_df.reset_index(drop=True, inplace=True)

# Display the feature importances
st.subheader("Feature importances")
st.session_state["edited_df_combined"] = st.data_editor(
    combined_df.style.hide(axis="index"),
    column_config={
        "Select": st.column_config.CheckboxColumn(required=True)
    },
    disabled=combined_df.drop("Select", axis=1).columns, use_container_width=True
)

# Allow users to enter the number of top features they want to select
top_features_input = st.number_input("Enter the number of top features", min_value=1, max_value=len(combined_df), value=None)

if top_features_input is not None:
    # Select the top features based on user input
    selected_df = combined_df.head(top_features_input)
    selected_features = selected_df['Feature'].tolist()
else:
    # Check if any features are selected via checkboxes
    selected_features = st.session_state.edited_df_combined[st.session_state.edited_df_combined['Select']]['Feature'].tolist()

# Determine the selected features based on user input
# selected_features = checkbox_selected_features if checkbox_selected_features else selected_features

selected_features.append(st.session_state.identifier)
selected_features.append(st.session_state.flag)
# Update the session state with the selected features
st.session_state.selected_features = selected_features

with st.expander("Matching Inputs", expanded=True):
    st.write("Matching Inputs")
    ui_columns = st.columns((1, 1))
    with ui_columns[0]:
        sample_size_A = st.slider("Sample Size for treatment Group", 1, 100, 100)
    with ui_columns[1]:
        sample_size_B = st.slider("Sample Size for Control Group", 1, 100, 100)
    with ui_columns[0]:
        st.write("#")
        run_matching = st.button(
            label="Run Matching"
        )
st.divider()
if run_matching:
    matching_df = get_matching_pairs(st.session_state.identifier, st.session_state.treated_df, st.session_state.non_treated_df, sample_size_A, sample_size_B, st.session_state.selected_features, st.session_state.flag)
    st.session_state.matching_df = matching_df
    # Display the result
    st.dataframe(st.session_state.matching_df)
    if st.session_state.matching_df is not None:
        # with st.expander("Download Matching DF"):
        download_button = st.download_button(
            label="Download Matched Data as CSV",
            data=st.session_state.matching_df.to_csv(index=False).encode(),
            file_name='matching_data.csv',
            mime='text/csv',
        )

# if 'matching_df' not in st.session_state:
#     st.session_state.matching_df = False

st.subheader("Matching diagnostics")
control_group = st.session_state.binned_df[st.session_state.binned_df[st.session_state.identifier].isin(st.session_state.matching_df['matched_Id'])]
treatment_group = st.session_state.binned_df[st.session_state.binned_df.Y == 1]

# create combined group and add propensity-score quartiles
combined_group = pd.concat([control_group, treatment_group])
combined_group['quartiles'] = pd.qcut(combined_group['propensity_score'], 4, labels=False)

combined_group.drop(st.session_state.identifier, axis=1, inplace=True)
st.session_state.combined_group = combined_group

if 'perform_diagnostics' not in st.session_state:
    st.session_state.perform_diagnostics = False

# Display button
perform_diagnostics = st.button(label="Run Diagnostics")

if perform_diagnostics or st.session_state.perform_diagnostics:
    st.session_state.perform_diagnostics = True
    with st.expander("Matching Diagnostics", expanded=True):
        left, right = st.columns(2)
        std_mean_diff_df2, std_mean_diff_df = cohend_code_function(st.session_state.binned_df, st.session_state.matching_df)
        st.subheader("Cohen's d Plot")
        cohend_plot_function(std_mean_diff_df2, std_mean_diff_df, selected_features)

        # Pre-matching Propensity Distribution
        st.subheader("Pre-matching Propensity Distributions")
        plot_propensity_distribution(st.session_state.binned_df[st.session_state.binned_df.Y == 1]['propensity_score'], st.session_state.binned_df[st.session_state.binned_df.Y == 0]['propensity_score'])

        # Post-matching Propensity Distribution
        st.subheader("Post-matching Propensity Distributions")
        temp = pd.merge(left=st.session_state.matching_df, right=st.session_state.binned_df[[st.session_state.identifier, 'propensity_score']], left_on='Id', right_on=st.session_state.identifier, how='left')
        temp.drop(st.session_state.identifier, axis=1, inplace=True)
        temp.rename({'Id': 'treatment_id', 'matched_Id': 'control_id', 'propensity_score': 'treatment_propensity'}, axis=1, inplace=True)
        temp = pd.merge(left=temp, right=st.session_state.binned_df[[st.session_state.identifier, 'propensity_score']], left_on='control_id', right_on=st.session_state.identifier, how='left')
        temp.drop(st.session_state.identifier, axis=1, inplace=True)
        temp.rename({'propensity_score': 'control_propensity'}, axis=1, inplace=True)

        plot_propensity_distribution(temp['treatment_propensity'], temp['control_propensity'])


with st.expander("Comparison Plots", expanded=True):
    st.markdown(
        "<p class='plot-header'>Change the selected variable to plot"
        " different charts</p>",
        unsafe_allow_html=True,
    )
    left, right = st.columns(2)
    with left:
        if 'selected_variable_comp' not in st.session_state:
            st.session_state.selected_variable_comp = []  # Initialize selected_variable

        selected_variable_comp = st.multiselect(
            "Variable",
            st.session_state.combined_group.columns,
            st.session_state.selected_variable_comp  # Set the default value to the stored session state
        )

    # Update session state with selected variable
    st.session_state.selected_variable_comp = selected_variable_comp

    if st.session_state.selected_variable_comp:
        # Plot comparisons for selected variables
        comparisons = {}
        for var in st.session_state.selected_variable_comp:
            comparisons[var] = comparison(combined_group, var)
            plot_comparison(comparisons[var])


# selected_variables = st.multiselect("Select variables for comparison", combined_group.columns)
# if selected_variables:
#     # Plot comparisons for selected variables
#     comparisons = {}
#     for var in selected_variables:
#         comparisons[var] = comparison(combined_group, var)
#         plot_comparison(comparisons[var])
requirements.txt
ADDED
@@ -0,0 +1,30 @@
dash==2.9.3
dash_auth==2.0.0
dash_bootstrap_components==1.4.1
holidays==0.24
hyperopt==0.2.7
joblib==1.2.0
matplotlib==3.5.1
mdutils==1.5.0
numpy==1.22.4
openpyxl==3.0.10
openpyxl_image_loader==1.0.5
pandas==1.5.2
# Pillow==9.4.0
Pillow==10.2.0
plotly==5.14.1
pmdarima==2.0.2
prophet==1.1.2
python-dotenv==1.0.0
# pytz==2022.7.1
pytz==2022.7
scikit_learn==1.2.2
scipy==1.7.3
seaborn==0.11.2
shap==0.41.0
statsmodels==0.13.5
streamlit==1.27.2
streamlit-aggrid==0.3.4.post3
sweetviz==2.3.1
waitress==2.1.2
xgboost==1.6.2
styles.css
ADDED
@@ -0,0 +1,58 @@
html {
    margin: 0;
}

#MainMenu {
    visibility: collapse;
}

footer {
    visibility: collapse;
}

div.block-container {
    padding: 2rem 3rem;
}

.main-header {
    display: flex;
    flex-direction: row;
    justify-content: space-between;
    align-items: center;
}

.main-header > img {
    max-height: 96px;
    /* max-width: 300px; */
    object-fit: cover;
}

button div {
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
}

h1 {
    color: #053057;
}

hr {
    height: 10px !important;
    color: #053057;
}

p.plot-header {
    font-size: small;
    font-weight: bold;
}

hr {
    margin: 0 0 10px 0;
    padding: 0;
}