Manoj committed on
Commit
a9415a6
1 Parent(s): 87ec425
Home.py ADDED
@@ -0,0 +1,96 @@
+ import streamlit as st
+ import pandas as pd
+ import os
+ import base64
+ from pathlib import Path
+
+ path = os.path.dirname(__file__)
+ with open(f"{path}/logo.png", "rb") as file_:
+     contents = file_.read()
+ data_url = base64.b64encode(contents).decode("utf-8")
+
+ def load_local_css(file_name):
+     with open(file_name) as f:
+         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+
+ def set_header():
+     return st.markdown(
+         f"""<div class='main-header'>
+                <h1>Synthetic Control</h1>
+                <img src="data:image;base64,{data_url}" alt="Logo">
+             </div>""",
+         unsafe_allow_html=True,
+     )
+
+
+ st.set_page_config(layout="wide")
+ load_local_css("styles.css")
+ set_header()
+
+ st.title("Input data")
+
+ data_file = st.file_uploader(
+     label="Choose a file",
+     accept_multiple_files=False,
+     key="user_upload_file",
+     type=["csv", "xlsx"]
+ )
+
+ info_placeholder = st.empty()
+
+ if data_file:
+     # df = pd.read_csv(data_file, dtype={'individual_id_ov': str})
+     dtype = {'individual_id_ov': 'str',
+              'past_3month_GMV_GMA': 'float64',
+              'past_3month_qty_GMA': 'int64',
+              'past_3month_orders_GMA': 'int64',
+              'past_6month_GMV_GMA': 'float64',
+              'past_6month_qty_GMA': 'int64',
+              'past_6month_orders_GMA': 'int64',
+              'past_9month_GMV_GMA': 'float64',
+              'past_9month_qty_GMA': 'int64',
+              'past_9month_orders_GMA': 'int64',
+              'past_12month_GMV_GMA': 'float64',
+              'past_12month_qty_GMA': 'int64',
+              'past_12month_orders_GMA': 'int64',
+              'avg_order_gap_between_GMA_purchases': 'float64',
+              'days_since_last_GMA_purchase': 'float64',
+              'age': 'float64',
+              'gender': 'str',
+              'income_group': 'str',
+              'age_group': 'str',
+              'urbanicity': 'str',
+              'ethnicity': 'str',
+              'Kids': 'str',
+              'hh_size_excl_child': 'str',
+              'hh_adult_qty': 'float64',
+              'hh_scs_est_per1000_income_amt': 'float64',
+              'avg_order_gap_between_WMT_purchases': 'float64',
+              'days_since_last_WMT_purchase': 'float64',
+              'Y': 'int64'}
+     df = pd.read_excel(data_file, sheet_name='sheet1', dtype=dtype, engine='openpyxl')
+     st.session_state.df = df
+     st.write(df.head())
+     with info_placeholder:
+         st.success("File upload successful")
+
+     data_file.seek(0)  # rewind the upload buffer before reading the second sheet
+     plot_df = pd.read_excel(data_file, sheet_name='sheet2')
+     st.session_state.plot_df = plot_df
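+     # Note: the workbook is expected to contain 'sheet1' (stored as
+     # st.session_state.df) and 'sheet2' (stored as st.session_state.plot_df);
+     # a .csv upload would need the commented pd.read_csv path above instead.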
+     # start_date = st.date_input("Start date")
+     # end_date = st.date_input("End date")
+
+     # # Show the selected date range
+     # st.write("Selected date range:", start_date, "to", end_date)
+
+ # uploaded_file = st.file_uploader("Choose a file")
+
+ # if uploaded_file is not None:
+ #     df = pd.read_csv(uploaded_file, dtype={'individual_id_ov': str})
+ #     st.session_state.df = df
+ #     st.success("File upload successful, here is the data preview")
+ #     st.write(df.head())
+
+
+
+
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
  colorTo: yellow
  sdk: streamlit
  sdk_version: 1.40.0
- app_file: app.py
+ app_file: Home.py
  pinned: false
  ---
logo.png ADDED
pages/pages/1_Imputations.py ADDED
@@ -0,0 +1,415 @@
+ ##### SAFE IMPUTATION #####
+
+ import pandas as pd
+ import numpy as np
+ from scipy import stats
+ import warnings
+ import streamlit as st
+ import base64
+
+ def outlier_per_col(df, col):
+     q1 = df[col].quantile(0.25)
+     q3 = df[col].quantile(0.75)
+     iqr = q3 - q1
+
+     # D'Agostino-Pearson normality test (stats.normaltest) on the column
+     stat, p = stats.normaltest(df[col])
+
+     # if p > 0.05 the data is consistent with a normal distribution;
+     # if p <= 0.05 the data is not normally distributed
+     if p <= 0.05:
+         lower_bound = q1 - 1.5 * iqr
+         upper_bound = q3 + 1.5 * iqr
+         outlier_df = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
+         outlier_per = (len(outlier_df) / len(df[col])) * 100
+     else:
+         z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
+         outlier_df = df[(z_score > 3)]
+         outlier_per = len(outlier_df) / len(df[col]) * 100
+     return outlier_per
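+
+ # A minimal usage sketch (hypothetical data, not part of the app flow):
+ #   demo = pd.DataFrame({"x": list(range(20)) + [500]})
+ #   outlier_per_col(demo, "x")  # -> % of rows outside the IQR or z-score fences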
+
+ def summary_stats(df, per_to_drop):
+     summary_df = df.isna().sum().reset_index().rename(columns={'index': 'variable', 0: 'null'})
+     summary_df['%null'] = (100 * summary_df['null'] / len(df)).round(2)
+     summary_df = summary_df.merge(df.dtypes.reset_index().rename(columns={'index': 'variable', 0: 'type'}), on='variable')
+     summary_df = summary_df.drop(columns=['null'])
+     summary_df = summary_df.drop(summary_df[summary_df['%null'] > per_to_drop].index)
+     df_numeric = df.select_dtypes(exclude='object')
+     df_categorical = df.select_dtypes(include='object')
+     if not df_numeric.empty:
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore")
+             summary_df['outlier%'] = summary_df[summary_df['variable'].isin(df_numeric.columns)].apply(lambda x: outlier_per_col(df_numeric, x['variable']), axis=1)
+     else:
+         summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'outlier%': []})])
+     summary_df = summary_df.merge((df.select_dtypes(exclude=['object']).nunique() / df.select_dtypes(exclude=['object']).count() * 100).reset_index().rename(columns={'index': 'variable', 0: 'unique%'}).round(2), on='variable', how='left').round(2)
+     summary_df = summary_df.merge(df.mean(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'mean'}).round(2), on='variable', how='left')
+     summary_df = summary_df.merge(df.std(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'standard deviation'}).round(2), on='variable', how='left')
+     summary_df = (summary_df.merge(df.var(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'variance'}), on='variable', how='left').assign(variance=lambda x: x['variance'].apply(lambda y: "{:.2f}".format(y))))
+     summary_df = summary_df.merge(df.skew(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'skewness'}).round(2), on='variable', how='left')
+     summary_df = summary_df.merge(df.kurt(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'kurtosis'}).round(2), on='variable', how='left')
+     summary_df = summary_df.merge(df.min(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'min'}), on='variable', how='left')
+     summary_df = summary_df.merge(df.max(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'max'}), on='variable', how='left')
+     summary_df['range'] = summary_df['max'] - summary_df['min']
+     if not df_numeric.empty:
+         summary_df = summary_df.merge((df.describe().loc['75%'].T - df.describe().loc['25%'].T).reset_index().rename(columns={'index': 'variable', 0: 'iqr'}), on='variable', how='left')
+     else:
+         summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'iqr': []})])
+     summary_df = summary_df.merge(df.median(numeric_only=True).reset_index().rename(columns={'index': 'variable', 0: 'median'}), on='variable', how='left')
+     if not df_categorical.empty:
+         summary_df = summary_df.merge(df.select_dtypes(include=['object']).mode().iloc[0].reset_index().rename(columns={'index': 'variable', 0: 'mode'}), on='variable', how='left')
+         summary_df = summary_df.merge(df.select_dtypes(include=['object']).nunique().reset_index().rename(columns={'index': 'variable', 0: 'distinct count'}), on='variable', how='left')
+     else:
+         summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'mode': []})])
+         summary_df = pd.concat([summary_df, pd.DataFrame({'variable': [], 'distinct count': []})])
+     return summary_df
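+
+ # summary_stats returns one row per retained variable with: %null, type,
+ # outlier%, unique%, mean, standard deviation, variance, skewness, kurtosis,
+ # min, max, range, iqr, median, and (for categorical columns only) mode and
+ # distinct count.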
+
+
+ def mean_imputation(df, col):
+     df[col].fillna(round(df[col].mean(), 2), inplace=True)
+
+ def median_imputation(df, col):
+     median = df[col].median()
+     df[col].fillna(round(median, 2), inplace=True)
+
+ def drop_rows(df, col):
+     df.dropna(subset=[col], inplace=True)
+
+ def drop_column(df, col):
+     df.drop(col, axis=1, inplace=True)
+
+ def mode_imputation(df, col):
+     mode = df[col].mode()[0]
+     df[col].fillna(mode, inplace=True)
+
+ def arbitrary_val(df, col, val):
+     df[col].fillna(val, inplace=True)
+
+ def linear_interpolate(df, col):
+     df[col].interpolate(method='linear', inplace=True)
+
+ def polynomial_interpolate(df, col):
+     df[col].interpolate(method='polynomial', order=2, inplace=True)
+
+ def interpolate_padding_forward(df, col):
+     df[col].fillna(method='ffill', inplace=True)
+
+ def interpolate_padding_backward(df, col):
+     df[col].fillna(method='bfill', inplace=True)
+
+ def fill_0(df, col):
+     df[col].fillna(0, inplace=True)
+
+ def remove_outliers(df, col):
+     stat, p = stats.normaltest(df[col])
+     if p <= 0.05:
+         q1 = df[col].quantile(0.25)
+         q3 = df[col].quantile(0.75)
+         iqr = q3 - q1
+         lower_bound = q1 - 1.5 * iqr
+         upper_bound = q3 + 1.5 * iqr
+         df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
+     else:
+         z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
+         df = df[(z_score < 3)]
+     return df
+
+ def mean_outlier(df, col):
+     stat, p = stats.normaltest(df[col])
+     if p <= 0.05:
+         q1 = df[col].quantile(0.25)
+         q3 = df[col].quantile(0.75)
+         iqr = q3 - q1
+         lower_bound = q1 - 1.5 * iqr
+         upper_bound = q3 + 1.5 * iqr
+         df.loc[df[col] < lower_bound, col] = df[col].mean()
+         df.loc[df[col] > upper_bound, col] = df[col].mean()
+     else:
+         z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
+         df.loc[z_score > 3, col] = df[col].mean()
+     return df
+
+ def median_outlier(df, col):
+     stat, p = stats.normaltest(df[col])
+     if p <= 0.05:
+         q1 = df[col].quantile(0.25)
+         q3 = df[col].quantile(0.75)
+         iqr = q3 - q1
+         lower_bound = q1 - 1.5 * iqr
+         upper_bound = q3 + 1.5 * iqr
+         df.loc[df[col] < lower_bound, col] = df[col].median()
+         df.loc[df[col] > upper_bound, col] = df[col].median()
+     else:
+         z_score = np.abs(df[col] - df[col].mean()) / df[col].std()
+         df.loc[z_score > 3, col] = df[col].median()
+     return df
+
+ def outlier_capping(df, col):
+     stat, p = stats.normaltest(df[col])
+     if p <= 0.05:
+         q1 = df[col].quantile(0.25)
+         q3 = df[col].quantile(0.75)
+         iqr = q3 - q1
+         lower_bound = q1 - 1.5 * iqr
+         upper_bound = q3 + 1.5 * iqr  # was q1 + 1.5 * iqr, which capped far too low
+         df[col] = np.where(df[col] >= upper_bound, upper_bound, np.where(df[col] <= lower_bound, lower_bound, df[col]))
+     else:
+         upper_limit = df[col].mean() + (3 * df[col].std())
+         lower_limit = df[col].mean() - (3 * df[col].std())
+         df[col] = np.where(df[col] >= upper_limit, upper_limit, np.where(df[col] <= lower_limit, lower_limit, df[col]))
+     return df
+
+ def perform_treatment_missing(df, col, treatments):
+     if treatments == 'mean':
+         mean_imputation(df, col)
+     elif treatments == 'median':
+         median_imputation(df, col)
+     elif treatments == 'drop row':
+         drop_rows(df, col)
+     elif treatments == 'drop column':
+         drop_column(df, col)
+     elif treatments == 'linear interpolation':
+         linear_interpolate(df, col)
+     elif treatments == 'polynomial interpolation':
+         polynomial_interpolate(df, col)
+     elif treatments == 'ffill':
+         interpolate_padding_forward(df, col)
+     elif treatments == 'bfill':
+         interpolate_padding_backward(df, col)
+     elif treatments == 'mode':
+         mode_imputation(df, col)
+     elif treatments == 'fill_0':
+         fill_0(df, col)
+     else:
+         return df[col]
+
+ def perform_treatment_outlier(df, col, treatments):
+     # the outlier helpers return a (possibly filtered) frame, so capture it
+     if treatments == 'remove':
+         df = remove_outliers(df, col)
+     elif treatments == 'mean':
+         df = mean_outlier(df, col)
+     elif treatments == 'median':
+         df = median_outlier(df, col)  # was median_imputation, a missing-value treatment
+     elif treatments == 'capping':
+         df = outlier_capping(df, col)
+     return df
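+
+ # Example (hypothetical column name): perform_treatment_missing(df, "age", "median")
+ # fills missing values in place, while df = perform_treatment_outlier(df, "age", "capping")
+ # returns the treated frame.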
+
+ def imputed_df(df, edited_df, identifier, flag, per_to_drop=None):
+     if per_to_drop is not None:
+         null_percentage = df.isnull().sum() / df.shape[0] * 100
+         col_to_drop = null_percentage[null_percentage > per_to_drop].keys()
+         df = df.drop(col_to_drop, axis=1)
+
+     cols_with_one_unique = df.columns[df.nunique() == 1]
+     df.drop(cols_with_one_unique, axis=1, inplace=True)
+
+     for col in edited_df['variable'].to_list():
+         perform_treatment_missing(df, col, edited_df.loc[edited_df['variable'] == col, 'Imputation method'].iloc[0])
+         df = perform_treatment_outlier(df, col, edited_df.loc[edited_df['variable'] == col, 'Outlier Treatment'].iloc[0])
+     return df
+
+ # flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
+ # identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))
+
+ # numerical_columns = st.session_state.df.select_dtypes(include=['number']).columns.tolist()
+ # numerical_columns = [x for x in numerical_columns if x != flag]
+ # categorical_columns = st.session_state.df.select_dtypes(include=['object', 'category']).columns.tolist()
+ # categorical_columns = [x for x in categorical_columns if x != identifier]
+
+ # st.session_state.flag = flag
+ # st.session_state.identifier = identifier
+ st.title("Data Summary")
+
+ with st.expander("Data Inputs"):
+     st.subheader("Data Inputs")
+     ui_columns = st.columns((1, 1))
+     columns = set(st.session_state.df.columns)
+     with ui_columns[0]:
+         flag = st.selectbox(
+             label="Flag variable",
+             options=list(columns),
+             index=list(columns).index(st.session_state.flag) if 'flag' in st.session_state and st.session_state.flag is not None else 0
+         )
+         per_to_drop = st.slider(
+             label="Select missing % threshold to drop columns",
+             key="per_to_drop",
+             min_value=0, max_value=100, value=st.session_state.per_to_drop if 'per_to_drop' in st.session_state else 80)
+
+     with ui_columns[-1]:
+         identifier = st.selectbox(
+             label="Identifier",
+             options=list(columns),
+             index=list(columns).index(st.session_state.identifier) if 'identifier' in st.session_state and st.session_state.identifier is not None else 0
+         )
+
+     # numerical_columns = st.session_state.df.select_dtypes(include=['number']).columns.tolist()
+     # numerical_columns = [x for x in numerical_columns if x != flag]
+     # categorical_columns = st.session_state.df.select_dtypes(include=['object', 'category']).columns.tolist()
+     # categorical_columns = [x for x in categorical_columns if x != identifier]
+     # st.session_state.numerical_columns = numerical_columns
+     # st.session_state.categorical_columns = categorical_columns
+     st.session_state.flag = flag
+     st.session_state.identifier = identifier
+
+     # st.subheader("Select Ordinal Columns:")
+     # with st.expander("Select Ordinal Columns:", expanded=True):
+     #     select_all_checkbox = st.checkbox("Select All", key="select_all_checkbox")
+
+     #     options = categorical_columns
+
+     #     # Checkboxes for each column
+     #     ordinal_columns = []
+     #     for option in options:
+     #         if select_all_checkbox or st.checkbox(option, key=f"checkbox_{option}"):
+     #             ordinal_columns.append(option)
+     #     st.session_state.ordinal_columns = list(ordinal_columns)
+
+     # nominal_columns = [x for x in categorical_columns if x not in ordinal_columns]
+     # st.session_state.numerical_columns = numerical_columns
+     # st.session_state.categorical_columns = categorical_columns
+     # st.session_state.ordinal_columns = ordinal_columns
+
+     # Ordinal columns order
+     # ordinal_col_dict = st.session_state.get("ordinal_col_dict", {})
+
+     # ordinal_col_dict = {}
+
+     # for col in ordinal_columns:
+     #     st.subheader(f"Ordering for Unique Values in {col}")
+
+     #     # Get unique values excluding NaN
+     #     unique_values = st.session_state.df[col].dropna().unique()
+
+     #     order_dict = {}
+
+     #     for val in unique_values:
+     #         order = st.number_input(f"Order for {val} in {col}", min_value=1, value=1)
+     #         order_dict[val] = order
+
+     #     ordinal_col_dict[col] = order_dict
+
+     # st.session_state.ordinal_col_dict = ordinal_col_dict
+
+ # User input for percentage threshold to drop columns
+ # per_to_drop = st.slider("Select Percentage Threshold to Drop Columns", min_value=0, max_value=100, value=10)
+ # st.session_state.per_to_drop = per_to_drop
+
+ summary_df = summary_stats(st.session_state.df, per_to_drop)
+ summary_df["Imputation method"] = None
+ summary_df["Outlier Treatment"] = None
+ summary_df["Imputation method"] = np.where(summary_df["type"] == 'object', 'mode', 'mean')
+ summary_df["Outlier Treatment"] = np.where(summary_df["type"] == 'object', summary_df["Outlier Treatment"], 'capping')
+ summary_df = summary_df[~summary_df['variable'].isin([flag, identifier])]
+ st.session_state.summary_df = summary_df
+
+ st.subheader("Variable Summary")
+
+ IMPUTATION_OPTIONS = ["mean", "median", "linear interpolation", "polynomial interpolation", "ffill", "bfill", "mode", "fill_0"]
+ OUTLIER_OPTIONS = ["capping", "remove", "mean", "median"]
+ NON_EDITABLE_COLUMNS = summary_df.columns.to_list()
+
+ def highlight_cols(s):
+     color = "#ccc"
+     return "background-color: %s" % color
+
+ column_config = {
+     "variable": st.column_config.TextColumn(disabled=True, width="medium"),
+     "type": st.column_config.TextColumn(disabled=True, width="medium"),
+     "%null": st.column_config.NumberColumn(disabled=True),
+     "unique%": st.column_config.NumberColumn(disabled=True),
+     "outlier%": st.column_config.NumberColumn(disabled=True),
+     "mean": st.column_config.NumberColumn(disabled=True),
+     "standard deviation": st.column_config.NumberColumn(disabled=True),
+     "variance": st.column_config.NumberColumn(disabled=True),
+     "skewness": st.column_config.NumberColumn(disabled=True),
+     "kurtosis": st.column_config.NumberColumn(disabled=True),
+     "min": st.column_config.NumberColumn(disabled=True),
+     "max": st.column_config.NumberColumn(disabled=True),
+     "range": st.column_config.NumberColumn(disabled=True),
+     "iqr": st.column_config.NumberColumn(disabled=True),
+     "median": st.column_config.NumberColumn(disabled=True),
+     "IV": st.column_config.NumberColumn(disabled=True),
+     "mode": st.column_config.TextColumn(disabled=True),
+     "distinct count": st.column_config.NumberColumn(disabled=True),
+     "Imputation method": st.column_config.SelectboxColumn(
+         options=IMPUTATION_OPTIONS, default=IMPUTATION_OPTIONS[0]  # default expects an option value, not an index
+     ),
+     "Outlier Treatment": st.column_config.SelectboxColumn(
+         options=OUTLIER_OPTIONS, default=OUTLIER_OPTIONS[0]
+     )
+ }
+
+
+ with st.expander("Variables from the data"):
+     edited_df = st.data_editor(
+         st.session_state.summary_df
+         .style.hide(axis="index")
+         .applymap(highlight_cols, subset=NON_EDITABLE_COLUMNS),
+         column_config=column_config,
+     )
+     if st.button("Submit changes"):
+         with st.spinner("Applying imputations"):
+             st.divider()
+             # edited_df already holds the table as edited in the data editor,
+             # including the per-variable Imputation method and Outlier Treatment
+             # (previously it was overwritten with the defaults from summary_df)
+
+             df_imputed = imputed_df(st.session_state.df, edited_df, st.session_state.identifier, st.session_state.flag, st.session_state.per_to_drop)
+             st.session_state.imputed_df = df_imputed
+             st.markdown("Imputed DataFrame")
+             st.dataframe(df_imputed.head(10))
+
+ # Add a download button for the imputed DataFrame
+ # if st.session_state.imputed_df is not None:
+ #     csv_data = st.session_state.imputed_df.to_csv(index=False).encode()
+ #     st.download_button(
+ #         label="Download Imputed DataFrame as CSV",
+ #         data=csv_data,
+ #         file_name="imputed_data.csv",
+ #         mime="text/csv"
+ #     )
+
+ # Add the download button after displaying the DataFrame
+ # if st.dataframe:
+ #     if st.button("Download Imputed Data"):
+ #         imputed_csv = imputed_df.to_csv(index=False)
+ #         b64 = base64.b64encode(imputed_csv.encode()).decode()
+ #         href = f'<a href="data:file/csv;base64,{b64}" download="imputed_data.csv">Download Imputed Data CSV File</a>'
+ #         st.markdown(href, unsafe_allow_html=True)
+
+ if "imputed_df" in st.session_state:
+     if st.button("Download Imputed Data"):
+         df_imputed = st.session_state.imputed_df
+         imputed_csv = df_imputed.to_csv(index=False)
+         b64 = base64.b64encode(imputed_csv.encode()).decode()
+         href = f'<a href="data:file/csv;base64,{b64}" download="imputed_data.csv">Download Imputed Data CSV File</a>'
+         st.markdown(href, unsafe_allow_html=True)
+
+
+
+ # Check if the "Submit changes" button has been clicked
+
+ # if st.button("Submit"):
+ #     st.write("Selected Columns and Ordinal Orders:")
+ #     st.write(ordinal_col_dict)
+
+ #     # Display summary stats
+ #     summary_df = summary_stats(st.session_state.df, per_to_drop)
+ #     st.write("Summary Stats:")
+ #     st.write(summary_df)
+
+ #     # User input for specific column
+ #     col_name = st.selectbox("Select a specific column name:", [None] + list(st.session_state.df.columns))
+
+ #     # Display stats for the specified column
+ #     if col_name in st.session_state.df.columns:
+ #         st.write(f"Stats for column '{col_name}':")
+ #         # Extract relevant information from 'summary_df' for the specific column
+ #         col_summary = summary_df[summary_df['variable'] == col_name][['%null', 'type', 'outlier%', 'unique%', 'mean', 'standard deviation', 'variance', 'skewness', 'kurtosis', 'min', 'max', 'range', 'iqr', 'median', 'mode', 'distinct count']]
+ #         col_summary = col_summary.T.reset_index()
+ #         col_summary.columns = ['Stats', 'Value']
+ #         # Display the summary statistics as a table
+ #         st.table(col_summary)
+ #     else:
+ #         st.warning("Please enter a valid column name.")
pages/pages/2_Profiling.py ADDED
@@ -0,0 +1,775 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.model_selection import GridSearchCV
+ import matplotlib.pyplot as plt
+ from tqdm import tqdm
+ from matplotlib.ticker import MaxNLocator
+ import streamlit as st
+ import ast
+ from collections import defaultdict
+ from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
+ from sklearn.cluster import KMeans, AgglomerativeClustering
+ from sklearn.preprocessing import LabelEncoder
+ # from kmodes.kmodes import KModes
+ import seaborn as sns
+ # from kmodes.kprototypes import KPrototypes
+ import warnings
+ from scipy import stats
+ import scipy.cluster.hierarchy as sch
+ from scipy.spatial.distance import pdist
+ import os
+ import re
+ import time
+ from plotly.subplots import make_subplots
+ import plotly.graph_objects as go
+ import plotly.express as px
+ import base64
+
+
+ def tree_based_bin_data(df, column_name, dep_var, depth_of_tree):
+     df2 = df.copy()
+     df2 = df2.loc[df2[column_name].notnull()]
+     x = df2[column_name].values.reshape(-1, 1)
+     y = df2[dep_var].values
+     params = {'max_depth': range(2, depth_of_tree + 1), 'min_samples_split': [2, 3, 5, 10], 'min_samples_leaf': [int(np.ceil(0.05 * len(x)))]}
+     clf = DecisionTreeClassifier()
+     g_search = GridSearchCV(clf, param_grid=params, scoring='accuracy')
+     g_search.fit(x, y)
+     best_clf = g_search.best_estimator_
+     bin_edges = best_clf.tree_.threshold
+     bin_edges = sorted(set(bin_edges[bin_edges != -2]))
+     tree_based_binned_data = value_bin_data(df, column_name, bin_edges)
+     return tree_based_binned_data
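+
+ # The tree binner fits a shallow DecisionTreeClassifier of the single column
+ # against dep_var (grid-searched over depth and minimum sample sizes) and
+ # reuses the fitted tree's split thresholds as bin edges, so the bins are
+ # chosen to separate the target rather than to be equal-width.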
+
+
+ def decile_bin_data(df, col, no_of_bins):
+     decile_binned_data = pd.qcut(df[col], no_of_bins, duplicates='drop')
+     return decile_binned_data
+
+
+ def value_bin_data(df, col, no_of_bins):
+     value_binned_data = pd.cut(df[col], no_of_bins, duplicates='drop')
+     return value_binned_data
+
+
+ def col_bin_summary_numerical(bin_df, col, dep_var=None):
+     unique_bin_edges = bin_df[col].unique()
+     df_new = pd.DataFrame({"bin_ranges": unique_bin_edges})
+
+     try:
+         df_new = df_new.merge((bin_df[col].value_counts() / len(bin_df) * 100).reset_index().rename(columns={'index': 'bin_ranges', col: 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
+     except Exception:
+         # newer pandas names the value_counts() reset_index columns differently
+         df_new = df_new.merge((bin_df[col].value_counts() / len(bin_df) * 100).reset_index().rename(columns={col: 'bin_ranges', 'count': 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
+     if dep_var is not None:
+         df_new = df_new.merge(bin_df.groupby(col)[dep_var].sum().reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Event'}), on='bin_ranges', how='left')
+         df_new = df_new.merge(bin_df.groupby(col)[dep_var].mean().reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'}), on='bin_ranges', how='left')
+         df_new['Index'] = (100 * df_new['Mean_DV'] / bin_df[dep_var].mean()).round()  # was hardcoded to bin_df['Y']
+         df_new = df_new[['bin_ranges', 'count%', 'Event', 'Mean_DV', 'Index']]
+         df_new = df_new.sort_values(by='bin_ranges')
+
+     return df_new
+
+
+ def plot_chart(df, col, dep_var):
+     # fig = go.Figure()
+     df['bin_ranges_str'] = df['bin_ranges'].astype(str)
+     fig = make_subplots(specs=[[{"secondary_y": True}]])
+
+     # Bar trace for Count%
+     fig.add_trace(
+         go.Bar(
+             x=df['bin_ranges_str'],
+             y=df['count%'],
+             name='Count%',
+             marker_color='#053057',
+             hovertemplate=(
+                 f"Bin: %{{x}}<br>"
+                 f"Count%: %{{y}}"
+             ),
+         )
+     )
+
+     # Add the line trace for Index on the secondary y-axis
+     fig.add_trace(
+         go.Scatter(
+             x=df['bin_ranges_str'],
+             y=df['Index'],
+             mode='lines+markers',
+             name='Index',
+             marker=dict(color="#8ac4f8"),
+             hovertemplate=(
+                 f"Bin: %{{x}}<br>"
+                 f"Index%: %{{y}}"
+             ),
+         ),
+         secondary_y=True
+     )
+
+     # Update layout
+     fig.update_layout(
+         title=f'Distribution of {col}',
+         xaxis=dict(title='Bin_ranges'),
+         yaxis=dict(title='Count%', color='#053057'),
+         yaxis2=dict(title='Index', color="#8ac4f8", overlaying='y', side='right'),
+         legend=dict(x=1.02, y=0.98),
+         hovermode='x'
+     )
+
+     fig.update_xaxes(showgrid=False)
+     fig.update_yaxes(showgrid=False)
+
+     return fig
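+
+ # plot_chart draws count% per bin as bars on the left axis and the Index line
+ # (100 = the overall average response rate) on a secondary right axis.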
+
+ # def plot_chart(df, col, dep_var=None):
+ #     fig, ax1 = plt.subplots(figsize=(10, 6))
+
+ #     # Convert Interval type to string
+ #     df['bin_ranges_str'] = df['bin_ranges'].astype(str)
+
+ #     ax1.bar(df['bin_ranges_str'], df['count%'], color='b', alpha=0.7, label='Count%')
+ #     ax1.set_xlabel('Bin Ranges')
+ #     ax1.set_ylabel('Count%', color='b')
+
+ #     if dep_var is not None:
+ #         ax2 = ax1.twinx()
+ #         ax2.plot(df['bin_ranges_str'], df['Index'], color='r', marker='o', label='Index')
+ #         ax2.set_ylabel('Index', color='r')
+
+ #     ax1.set_title(f'Distribution of {col}')
+ #     ax1.legend(loc='upper left')
+
+ #     return st.plotly_chart(fig)
+
+
+ def create_numerical_binned_data(df, col, func, no_of_bins=None, dep_var=None, depth=None):
+     df_org = df.copy()
+
+     if dep_var is not None:
+         df_org[dep_var] = df_org[dep_var].astype('int64')
+         df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)
+
+         if func == 'tree':
+             bin_df = tree_based_bin_data(df, col, dep_var, depth)
+         elif func == 'decile':
+             bin_df = decile_bin_data(df_num, col, 10)
+         else:
+             bin_df = value_bin_data(df_num, col, no_of_bins)
+
+         bin_df = pd.concat([bin_df, df_org[dep_var]], axis=1)
+     else:
+         df_num = df_org.select_dtypes(include=[np.number])
+
+         if func == 'decile':
+             bin_df = decile_bin_data(df_num, col, no_of_bins)
+         else:
+             bin_df = value_bin_data(df_num, col, no_of_bins)
+
+     df_summary = col_bin_summary_numerical(bin_df, col, dep_var)
+
+     return df_summary
+
+
+ def create_numerical_binned_data1(df, col, func, no_of_bins, dep_var, depth=None):
+     df_org = df.copy()
+
+     df_org[dep_var] = df_org[dep_var].astype('int64')
+     df_num = df_org.select_dtypes(include=[np.number]).drop(dep_var, axis=1)
+
+     if func == 'tree':
+         bin_df = tree_based_bin_data(df, col, dep_var, depth)
+     elif func == 'decile':
+         bin_df = decile_bin_data(df_num, col, no_of_bins)
+     else:
+         bin_df = value_bin_data(df_num, col, no_of_bins)
+
+     bin_df = pd.concat([bin_df, df_org[dep_var]], axis=1)
+
+     binned_data = pd.DataFrame()
+     binned_data[col] = df_org[col]
+     unique_bins = bin_df[col].unique()
+     for bin_value in unique_bins:
+         bin_column_name = f"{col}_{bin_value}"
+         binned_data[bin_column_name] = np.where(binned_data[col] == bin_value, df_org[col], 0)
+
+     return binned_data
+
+
+ # Categorical cols binning
+
+ def woe_iv(df, column_name, dep_var, no_of_bins):
+     y0 = df[dep_var].value_counts()[0]
+     y1 = df[dep_var].value_counts()[1]
+     if df[column_name].nunique() < 10:
+         data = pd.Series(pd.factorize(df[column_name])[0] + 1, index=df.index).rename('{}'.format(column_name)).apply(lambda x: f'bin{x}')
+     else:
+         df_woe_iv = (pd.crosstab(df[column_name], df[dep_var], normalize='columns').assign(woe=lambda dfx: np.log((dfx[1] + (0.5 / y1)) / (dfx[0] + (0.5 / y0)))).assign(iv=lambda dfx: (dfx['woe'] * (dfx[1] - dfx[0]))))
+         woe_map = df_woe_iv['woe'].to_dict()
+         woe_col = df[column_name].map(woe_map)
+         data = pd.qcut(woe_col, no_of_bins, duplicates='drop')
+         n = data.nunique()
+         labels = [f'bin{i}' for i in range(1, n + 1)]
+         data = data.cat.rename_categories(labels)
+         sizes = data.value_counts(normalize=True)
+         min_size = 0.05
+         while sizes.min() < min_size and no_of_bins > 1:
+             no_of_bins -= 1
+             data = pd.qcut(woe_col, q=no_of_bins, duplicates='drop')
+             if data.nunique() != data.cat.categories.nunique():
+                 continue
+             n = data.nunique()
+             labels = [f'bin{i}' for i in range(1, n + 1)]
+             data = data.cat.rename_categories(labels)
+             sizes = data.value_counts(normalize=True)
+     return data
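+
+ # Weight-of-evidence used above, with 0.5-count smoothing to avoid log(0):
+ #   WOE(v) = ln( P(value=v | Y=1) / P(value=v | Y=0) )
+ #   IV     = sum over values of WOE * (P(v | Y=1) - P(v | Y=0))
+ # Categories are mapped to their WOE and then quantile-binned, shrinking the
+ # bin count until every bin holds at least ~5% of rows.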
+
+ def naive_cat_bin(df, col, max_thre=10, min_thre=5, tolerence=2, flag='ignore'):
+     value_counts = df[col].value_counts()
+     total_values = len(df)
+     count_percentages = (value_counts / total_values) * 100
+     unique_values_df = pd.DataFrame({'Category': value_counts.index, 'Count Percentage': count_percentages})
+     count_per = list(unique_values_df['Count Percentage'])
+
+     final_ini = []
+     for i in count_per:
+         if i >= min_thre:
+             final_ini.append(i)
+     a = [x for x in count_per if x not in final_ini]
+
+     total_bins = int(100 / max_thre)
+     ava_bins = len(final_ini)
+     ava_bin_per = sum(final_ini)
+     bin_req = total_bins - ava_bins
+     bin_req_per = 100 - ava_bin_per
+
+     if flag == 'error' and bin_req > 0 and (bin_req_per / bin_req) > max_thre:
+         print(f"Binning for {col} is not possible with given parameters.")
+         return
+
+     step = False
+     while not step:
+         if bin_req > 0:
+             if (bin_req_per / bin_req) > min_thre:
+                 step = True
+             else:
+                 bin_req -= 1
+         else:
+             step = True
+
+     final_ini = [[x] for x in final_ini]
+
+     if bin_req > 0:
+         target_sum = bin_req_per / bin_req
+     else:
+         target_sum = bin_req_per
+         tolerence = 0
+
+     final = []
+     current_sum = 0.0
+     start_index = len(a) - 1
+     values = []
+     while start_index >= 0:
+         current_sum += a[start_index]
+         values.append(a[start_index])
+         if current_sum < target_sum - tolerence:
+             start_index -= 1
+         else:
+             final.append(values)
+             values = []
+             start_index -= 1
+             current_sum = 0.0
+     final.append(values)
+     final = final[::-1]
+     final = [sublist for sublist in final if sublist]
+     final_b = final_ini + final
+
+     final = [final_b[0]]
+     for subarr in final_b[1:]:
+         if sum(subarr) < (min_thre - tolerence):
+             final[-1].extend(subarr)
+         else:
+             final.append(subarr)
+
+     table = dict(zip(unique_values_df['Category'], unique_values_df['Count Percentage']))
+     new_final = [sublist.copy() for sublist in final]
+
+     table_reverse = defaultdict(list)
+     for k, v in table.items():
+         table_reverse[v].append(k)
+
+     output = []
+     for l in new_final:
+         temp = []
+         for item in l:
+             temp.append(table_reverse[item].pop())
+         output.append(temp)
+     new_final = output
+
+     k = len(new_final)
+     bin_labels = [f'bin{i}' for i in range(1, k + 1)]
+     bin_mapping = {value: bin_labels[i] for i, sublist in enumerate(new_final) for value in sublist}
+     bin_mapping[np.nan] = 'binNA'
+     return df[col].apply(lambda x: bin_mapping.get(x, x))
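+
+ # naive_cat_bin keeps every category whose share is at least min_thre% as its
+ # own bin, then greedily groups the remaining rare categories (rarest first)
+ # into bins of roughly equal total share, within the given tolerence; missing
+ # values map to 'binNA'.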
+
+
+ def col_bin_summary_categorical(df_cat, col, binned_df_1, dep_var=None):
+     unique_values_in_bins = df_cat.groupby(binned_df_1[col])[col].unique().apply(list)
+     unique_values_in_bins = unique_values_in_bins.rename_axis('bin').reset_index()
+     unique_bin_ranges = pd.Categorical(binned_df_1[col].unique())
+     uni = binned_df_1[col].nunique()
+     numeric_parts = [uni if val == 'binNA' else int(re.findall(r'\d+', val)[0]) for val in unique_bin_ranges]
+     unique_bin_ranges = unique_bin_ranges[np.argsort(numeric_parts)]
+     df_new_cat = pd.DataFrame({"column_name": [col] * len(unique_bin_ranges), "bin_ranges": unique_bin_ranges})
+     df_new_cat = df_new_cat.merge(unique_values_in_bins.rename(columns={'bin': 'bin_ranges', col: 'values in bin'}))
+     df_new_cat = df_new_cat.merge((binned_df_1[col].value_counts() / len(binned_df_1) * 100).reset_index().rename(columns={col: 'bin_ranges', 'count': 'count%'}).sort_values(by='bin_ranges').reset_index(drop=True), on='bin_ranges').round(2)
+     if dep_var is not None:
+         df_new_cat = df_new_cat.merge(binned_df_1.groupby(col)[dep_var].sum(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Event'}), on='bin_ranges')
+         df_new_cat = df_new_cat.merge(binned_df_1.groupby(col)[dep_var].mean(numeric_only=True).reset_index().rename(columns={col: 'bin_ranges', dep_var: 'Mean_DV'}), on='bin_ranges')
+         df_new_cat['Index'] = (100 * df_new_cat['Mean_DV'] / binned_df_1[dep_var].mean()).round()
+     return df_new_cat
+
+ def create_categorical_binned_data(imputed_df, col, categorical_binning, dep_var, no_of_bins=None, max_thre=None, min_thre=None, tolerence=2, flag='ignore'):
+
+     imputed_df[dep_var] = imputed_df[dep_var].astype('int64')
+     df_cat = imputed_df.select_dtypes(include=['object'])
+     # remove columns with only one unique value
+     unique_counts = df_cat.nunique()
+     unique_cols = unique_counts[unique_counts == 1].index.tolist()
+     df_cat = df_cat.drop(unique_cols, axis=1)
+
+     if categorical_binning == 'woe_iv':
+         df_nominal = pd.concat([imputed_df[col], imputed_df[dep_var]], axis=1)
+         tqdm.pandas(dynamic_ncols=True, position=0)
+         binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, dep_var, no_of_bins))
+         binned_df_nominal.drop(dep_var, axis=1, inplace=True)
+         binned_df_nominal = binned_df_nominal.applymap(lambda x: 'NA' if pd.isnull(x) else x)
+         binned_df_nominal = binned_df_nominal.astype('category')
+
+         cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
+         binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
+
+         binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dep_var]], axis=1)
+     elif categorical_binning == 'naive':
+         df_nominal = pd.concat([imputed_df[col], imputed_df[dep_var]], axis=1)
+         tqdm.pandas(dynamic_ncols=True, position=0)
+         binned_df_nominal = df_nominal.progress_apply(lambda x: naive_cat_bin(df_nominal, x.name, 20, 5, 2, flag='ignore'))
+         binned_df_nominal.drop(dep_var, axis=1, inplace=True)
+         binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')
+         binned_df_nominal = binned_df_nominal.astype('category')
+
+         cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
+         binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
+
+         binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dep_var]], axis=1)
+
+     df_summary = col_bin_summary_categorical(df_cat, col, binned_df_nominal_1, dep_var)
+     return df_summary
+
+ def create_categorical_binned_data1(imputed_df, col, nominal_binning, dependant_target_variable, no_of_bins=10, max_thre=10, min_thre=5, tolerence=2, flag='ignore', min_cluster_size=0.05, max_clusters=10):
+
+     imputed_df[dependant_target_variable] = imputed_df[dependant_target_variable].astype('int64')
+     df_cat = imputed_df.select_dtypes(include=['object'])
+     # remove columns with only one unique value
+     unique_counts = df_cat.nunique()
+     unique_cols = unique_counts[unique_counts == 1].index.tolist()
+     df_cat = df_cat.drop(unique_cols, axis=1)
+
+     if nominal_binning == 'woe_iv':  # was 'woe', which never matched the UI options
+         df_nominal = pd.concat([imputed_df[col], imputed_df[dependant_target_variable]], axis=1)
+         tqdm.pandas(dynamic_ncols=True, position=0)
+         binned_df_nominal = df_nominal.progress_apply(lambda x: woe_iv(df_nominal, x.name, dependant_target_variable, no_of_bins))
+         binned_df_nominal.drop(dependant_target_variable, axis=1, inplace=True)
+         binned_df_nominal = binned_df_nominal.applymap(lambda x: 'NA' if pd.isnull(x) else x)
+         binned_df_nominal = binned_df_nominal.astype('category')
+
+         cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
+         binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
+
+         binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dependant_target_variable]], axis=1)
+     elif nominal_binning == 'naive':
+         df_nominal = pd.concat([imputed_df[col], imputed_df[dependant_target_variable]], axis=1)
+         tqdm.pandas(dynamic_ncols=True, position=0)
+         binned_df_nominal = df_nominal.progress_apply(lambda x: naive_cat_bin(df_nominal, x.name, 20, 5, 2, flag='ignore'))
+         binned_df_nominal.drop(dependant_target_variable, axis=1, inplace=True)
+         binned_df_nominal = binned_df_nominal.dropna(axis=1, how='all')
+         binned_df_nominal = binned_df_nominal.astype('category')
+
+         cols_with_one_unique_bin = binned_df_nominal.columns[binned_df_nominal.nunique() == 1]
+         binned_df_nominal.drop(cols_with_one_unique_bin, axis=1, inplace=True)
+
+         binned_df_nominal_1 = pd.concat([binned_df_nominal, imputed_df[dependant_target_variable]], axis=1)
+
+     df_summary = col_bin_summary_categorical(df_cat, col, binned_df_nominal_1, dependant_target_variable)
+
+     binned_data = pd.DataFrame()
+     for bin_value in df_summary['values in bin']:
+         bin_column_name = f"{col}_{bin_value}"
+         binned_data[bin_column_name] = np.where(df_cat[col].isin(bin_value), 1, 0)
+
+     return binned_data
+
+
+
+ numerical_columns = st.session_state.imputed_df.select_dtypes(include=['number']).columns.tolist()
+ numerical_columns = [x for x in numerical_columns if x != st.session_state.flag]
+ categorical_columns = st.session_state.imputed_df.select_dtypes(include=['object', 'category']).columns.tolist()
+ categorical_columns = [x for x in categorical_columns if x != st.session_state.identifier]
+ st.session_state.numerical_columns = numerical_columns
+ st.session_state.categorical_columns = categorical_columns
+
+
+ st.title("Variable Profiling")
+
+ # Retrieve stored options from session_state or use default values
+ function_num = st.session_state.get("function_num", "value")
+ depth = st.session_state.get("depth", 3)
+ num_bins = st.session_state.get("num_bins", 10)
+ function_cat = st.session_state.get("function_cat", "woe_iv")
+ max_slider = st.session_state.get("max_slider", 10)
+ min_slider = st.session_state.get("min_slider", 5)
+ cat_bins_iv = st.session_state.get("cat_bins_iv", 10)
+ cat_bins_naive = st.session_state.get("cat_bins_naive", 10)
+
+ with st.expander("Profiling Inputs"):
+     st.write("Binning Inputs")
+     ui_columns = st.columns((1, 1))
+     with ui_columns[0]:
+         function_num = st.selectbox(
+             label="Select Numerical Binning Function",
+             options=['value', 'tree'],
+             # index=None
+             index=['value', 'tree'].index(st.session_state.function_num) if 'function_num' in st.session_state and st.session_state.function_num is not None else None
+         )
+         st.session_state.function_num = function_num  # Store selected option
+         params_num = st.empty()
+
+     with params_num:
+         with ui_columns[-1]:
+             if function_num == 'tree':
+                 depth = st.slider(
+                     label="Depth",
+                     min_value=1,
+                     max_value=10,
+                     value=depth,
+                     key='depth_slider')
+                 st.session_state.depth = depth  # Store selected depth
+             elif function_num == 'value':
+                 num_bins = st.slider(
+                     label="Number of Bins",
+                     min_value=2,
+                     max_value=20,
+                     value=num_bins,
+                     key='num_bins_slider_num')
+                 st.session_state.num_bins = num_bins  # Store selected number of bins
+     left, right = st.columns(2)
+
+     with left:
+         function_cat = st.selectbox(
+             label="Select Categorical Binning Function",
+             options=['woe_iv', 'naive'],
+             # index=None
+             index=['woe_iv', 'naive'].index(st.session_state.function_cat) if 'function_cat' in st.session_state and st.session_state.function_cat is not None else None
+         )
+         st.session_state.function_cat = function_cat  # Store selected option
+         params_cat = st.empty()
+
+     with params_cat:
+
+         if function_cat == 'woe_iv':
+             with right:
+                 cat_bins_iv = st.slider(
+                     label="Number of Bins",
+                     min_value=2,
+                     max_value=20,
+                     value=cat_bins_iv,
+                     key='num_bins_slider_cat_iv')
+                 st.session_state.cat_bins_iv = cat_bins_iv  # Store selected number of bins
+             with left:
+                 min_slider = st.slider(
+                     label="Min Threshold",
+                     min_value=1,
+                     max_value=100,
+                     value=min_slider,
+                     key='min_slider')
+                 st.session_state.min_slider = min_slider  # Store selected min threshold
+             with right:
+                 max_slider = st.slider(
+                     label="Max Threshold",
+                     min_value=1,
+                     max_value=100,
+                     value=max_slider,
+                     key='max_slider')
+                 st.session_state.max_slider = max_slider  # Store selected max threshold
+         elif function_cat == 'naive':
+             with right:
+                 cat_bins_naive = st.slider(
+                     label="Number of Bins",
+                     min_value=2,
+                     max_value=20,
+                     value=cat_bins_naive,
+                     key='num_bins_slider_cat_naive')
+                 st.session_state.cat_bins_naive = cat_bins_naive  # Store selected number of bins
+
+     with left:
+         st.write("#")
+         perform_profiling = st.button(
+             label="Perform profiling"
+         )
+
+
+ # if perform_profiling:
+ #     binned_data_num = pd.DataFrame()
+ #     for col in st.session_state.numerical_columns:
+ #         if function_num != 'tree':
+ #             depth = None
+ #         if function_num != 'value':
+ #             num_bins = None
+ #         binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
+ #         binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
+ #         binned_data_num = pd.concat([binned_data_num, binned_data_col], axis=0)
+ #     st.markdown("binned_data_num")
+ #     st.dataframe(binned_data_num, use_container_width=True, hide_index=True)
+
+ if perform_profiling:
+     with st.expander("Profiling summary"):
+         st.write("Numerical binned data")
+         binned_data_num = pd.DataFrame()
+         for col in st.session_state.numerical_columns:
+             if function_num != 'tree':
+                 depth = None
+             if function_num != 'value':
+                 num_bins = None
+             binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
+             binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
+             binned_data_num = pd.concat([binned_data_num, binned_data_col], axis=0)
+         st.dataframe(binned_data_num, use_container_width=True, hide_index=True)
+
+         st.write("Categorical binned data")
+         binned_data_cat = pd.DataFrame()
+         for col in st.session_state.categorical_columns:
+             # pick the thresholds/bin count for the chosen binning function
+             # (the original second if/else reset no_of_bins to None for 'woe_iv')
+             if function_cat == 'woe_iv':
+                 max_thre = max_slider
+                 min_thre = min_slider
+                 no_of_bins = cat_bins_iv
+             elif function_cat == 'naive':
+                 max_thre = None
+                 min_thre = None
+                 no_of_bins = cat_bins_naive
+             binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
+             binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
+             binned_data_col_cat.drop('column_name', axis=1, inplace=True)
+             binned_data_cat = pd.concat([binned_data_cat, binned_data_col_cat], axis=0)
+         st.dataframe(binned_data_cat, use_container_width=True, hide_index=True)
+
+
+     with st.expander("Profiling summary: Plots"):
+         st.markdown(
+             "<p class='plot-header'>Change the selected variable to plot"
+             " different charts</p>",
+             unsafe_allow_html=True,
+         )
+         left, right = st.columns(2)
+         with left:
+             if 'selected_variable' not in st.session_state:
+                 st.session_state.selected_variable = []  # Initialize selected_variable
+
+             selected_variable = st.selectbox(
+                 "Variable",
+                 st.session_state.numerical_columns + st.session_state.categorical_columns,
+                 # index=None
+             )
+             if isinstance(selected_variable, str):
+                 selected_variable = [selected_variable]  # Convert single selection to list
+
+             # Update session state with selected variable
+             st.session_state.selected_variable = selected_variable
+
+
+         # Iterate over selected variable(s)
+         if st.session_state.selected_variable:
+             for col in st.session_state.selected_variable:
+                 if col in st.session_state.numerical_columns:
+                     if function_num != 'tree':
+                         depth = None
+                     if function_num != 'value':
+                         num_bins = None
+                     binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
+                     binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
+                     fig = plot_chart(binned_data_col, col, dep_var=None)
+                     st.plotly_chart(fig, use_container_width=True)
+
+                 elif col in st.session_state.categorical_columns:
+                     if function_cat == 'woe_iv':
+                         max_thre = max_slider
+                         min_thre = min_slider
+                         no_of_bins = cat_bins_iv
+                     elif function_cat == 'naive':
+                         max_thre = None
+                         min_thre = None
+                         no_of_bins = cat_bins_naive
+                     binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
+                     binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
+                     binned_data_col_cat.drop('column_name', axis=1, inplace=True)
+                     fig_cat = plot_chart(binned_data_col_cat, col, dep_var=None)
+                     st.plotly_chart(fig_cat, use_container_width=True)
+
+
+     st.divider()
+     # Combine numerical and categorical binned data into one dataframe
+     binned_data_combined = pd.DataFrame()
+
+     # Process numerical columns
+     for col in st.session_state.numerical_columns:
+         if function_num != 'tree':
+             depth = None
+         if function_num != 'value':
+             num_bins = None
+         binned_data_num = create_numerical_binned_data1(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
+         binned_data_combined = pd.concat([binned_data_combined, binned_data_num], axis=1)
+
+     # Process categorical columns
+     for col in st.session_state.categorical_columns:
+         if function_cat == 'woe_iv':
+             max_thre = max_slider
+             min_thre = min_slider
+             no_of_bins = cat_bins_iv
+         elif function_cat == 'naive':
+             max_thre = None
+             min_thre = None
+             no_of_bins = cat_bins_naive
+         binned_data_cat = create_categorical_binned_data1(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
+         binned_data_combined = pd.concat([binned_data_combined, binned_data_cat], axis=1)
+
+     def clean_column_name(column_name):
+         # strip the decimal part of bin-edge numbers embedded in column names
+         return re.sub(r'\.(\d+)', '', column_name)
+     binned_data_combined.columns = binned_data_combined.columns.map(clean_column_name)
+     valid_feature_names = [name.replace('[', '').replace(']', '').replace('<', '').replace(',', '_').replace('(', '').replace("'", '') for name in binned_data_combined.columns]
+     valid_feature_names = [name.replace(' ', '') for name in valid_feature_names]
+     binned_data_combined.columns = valid_feature_names
+     # Display the combined binned data dataframe
+     st.session_state.binned_df = binned_data_combined
+     st.session_state.binned_df[st.session_state.flag] = st.session_state.imputed_df[st.session_state.flag]
+     st.session_state.binned_df.insert(0, st.session_state.identifier, st.session_state.imputed_df[st.session_state.identifier])
+     # print(st.session_state.binned_df['individual_id_ov'])  # debug output; assumes a specific identifier column
+     # st.session_state.binned_df[st.session_state.identifier] = st.session_state.imputed_df[st.session_state.identifier]
+     st.markdown("Binned DataFrame")
+     st.dataframe(binned_data_combined.head(10), use_container_width=True, hide_index=True)
+
+     # Add a button to download the binned dataframe
+     if st.session_state.binned_df is not None:
+         # with st.expander("Download Binned Data"):
+         download_button = st.download_button(
+             label="Download Binned Data as CSV",
+             data=st.session_state.binned_df.to_csv(index=False).encode(),
+             file_name='binned_data.csv',
+             mime='text/csv',
+         )
+
+
+ # Create a button to download the DataFrame as CSV
+ # if st.button("Download Binned Data"):
+ #     binned_csv = binned_df.to_csv(index=False)
+ #     b64 = base64.b64encode(binned_csv.encode()).decode()
+ #     href = f'<a href="data:file/csv;base64,{b64}" download="binned_data.csv">Download Binned Data CSV File</a>'
+ #     st.markdown(href, unsafe_allow_html=True)
+
+
+ # def download_button(data, file_name, button_text):
+ #     csv = data.to_csv(index=False).encode()
+ #     href = f'<a href="data:file/csv;base64,{csv.decode()}" download="{file_name}">{button_text}</a>'
+ #     st.markdown(href, unsafe_allow_html=True)
+
+ # # Add the download button
+ # download_button(binned_data_combined, 'data.csv', 'Download CSV')
+
+
+ # with st.expander("Profiling summary: Plots"):
+ #     st.markdown(
+ #         "<p class='plot-header'>Change the selected variable to plot"
+ #         " different charts</p>",
+ #         unsafe_allow_html=True,
+ #     )
+ #     st.write("Numerical binned data plots")
+ #     for col in st.session_state.numerical_columns:
+ #         if function_num != 'tree':
+ #             depth = None
+ #         if function_num != 'value':
+ #             num_bins = None
+ #         binned_data_col = create_numerical_binned_data(st.session_state.imputed_df, col, function_num, num_bins, st.session_state.flag, depth)
+ #         binned_data_col.insert(0, 'column_bin', col + '_' + binned_data_col['bin_ranges'].astype(str))
+ #         fig = plot_chart(binned_data_col, col, dep_var=None)
+ #         st.plotly_chart(fig, use_container_width=False)
+
+ #     st.write("Categorical binned data plots")
+ #     for col in st.session_state.categorical_columns:
+ #         if function_cat == 'woe_iv':
+ #             max_thre = max_slider
+ #             min_thre = min_slider
+ #             no_of_bins = cat_bins_iv
+ #         elif function_cat == 'naive':
+ #             max_thre = None
+ #             min_thre = None
+ #             no_of_bins = cat_bins_naive
+ #         binned_data_col_cat = create_categorical_binned_data(st.session_state.imputed_df, col, function_cat, st.session_state.flag, no_of_bins=no_of_bins, max_thre=max_thre, min_thre=min_thre, tolerence=2, flag='ignore')
+ #         binned_data_col_cat.insert(0, 'column_bin', col + '_' + binned_data_col_cat['values in bin'].astype(str))
+ #         binned_data_col_cat.drop('column_name', axis=1, inplace=True)
+ #         fig_cat = plot_chart(binned_data_col_cat, col, dep_var=None)
+ #         st.plotly_chart(fig_cat, use_container_width=False)
pages/pages/3_Point estimates.py ADDED
@@ -0,0 +1,369 @@
1
+ ###### SUPER SAFE ######
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ import streamlit as st
8
+ import seaborn as sn
9
+ import matplotlib.pyplot as plt
10
+ from sklearn.linear_model import LogisticRegression
11
+ from sklearn.preprocessing import MinMaxScaler, StandardScaler
12
+ from sklearn.metrics import confusion_matrix, classification_report
13
+ from sklearn.model_selection import train_test_split
14
+ import xgboost as xgb
15
+ from sklearn.linear_model import LinearRegression
16
+ from sklearn.metrics import mean_squared_error, r2_score
17
+ from sklearn.decomposition import PCA
20
+ import plotly.figure_factory as ff
21
+
22
+
23
+ st.set_page_config(
24
+ layout="wide",
25
+ )
26
+
27
+ def point_estimates(df, model_type, flag, identifier, control_sample_size, solver=None, max_iter=None, class_weights=None, max_depth=None, subsample=None, eta=None):
28
+ # if set(df[df[flag] == 0][identifier]).intersection(set(df[df[flag] == 1][identifier])):
29
+ # st.error("The identifier should not be common between flag values 0 and 1.")
30
+
31
+ Xs = df.drop(columns=[c for c in [identifier, flag, 'propensity_score'] if c in df.columns])  # also drop any score column left over from a previous run
32
+ X_scaled = StandardScaler().fit_transform(Xs)
33
+ n_comp = len(Xs.columns)
34
+ pca = PCA(n_components=n_comp)
35
+ pca.fit(X_scaled)
36
+ princ_comp = pca.transform(X_scaled)
37
+ PCA_DF = pd.DataFrame(princ_comp)
38
+ pca_var = pca.explained_variance_ratio_[0:n_comp].cumsum()
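+ # keep the smallest number of leading components whose cumulative explained variance exceeds 99.5%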
39
+ idx = [i for i in range(len(pca_var)) if pca_var[i] > 0.995][0]
40
+ df_pca = PCA_DF.loc[:, 0:idx]
41
+ df_pca[flag]=df[flag]
42
+ print(df_pca)
43
+ #creating train and control datasets
44
+ df_train = df_pca[df_pca[flag] == 1]
45
+ df_control = df_pca[df_pca[flag] == 0]
46
+ df_control_sample = df_control.sample(n=control_sample_size, random_state=42)
47
+ final_df_sample = pd.concat([df_train, df_control_sample], ignore_index=True)
48
+ non_req_cols=[flag]
49
+ req_cols=df_pca.columns[~df_pca.columns.isin(non_req_cols)]
50
+ # create a holdout set
51
+ identifier_df, X, y = df[[identifier]], final_df_sample[req_cols], final_df_sample[[flag]]
52
+ if model_type == 'linear':
53
+ # scale features
54
+ # min_max_scaler = MinMaxScaler()
55
+ # X_norm = min_max_scaler.fit_transform(X)
56
+ #X_norm = (X - X.min()) / (X.max() - X.min())
57
+ # fit model
58
+ model = LogisticRegression(solver=solver, max_iter=max_iter, class_weight=class_weights)
59
+ model.fit(X, y.values.ravel())
60
+ #feature importances
61
+ coefs = model.coef_[0]
62
+ feats = X.columns
63
+ importance_df = pd.DataFrame({'features':feats, 'coefficients':coefs})
64
+ importance_df['abs_coef'] = np.abs(importance_df['coefficients'])
65
+ elif model_type == 'xgboost':
66
+ model = xgb.XGBClassifier(max_depth=max_depth, subsample=subsample, eta=eta)
67
+ model.fit(X, y)
68
+ importance = model.feature_importances_
69
+ feats = X.columns
70
+ importance_df = pd.DataFrame({'features':feats, 'Importance':importance})
71
+
72
+ #Prediction
73
+ Y_pred = model.predict(X)
74
+ #Confusion matrix
75
+ #cm = confusion_matrix(y, Y_pred)/y.shape[0]
76
+ cm = confusion_matrix(y, Y_pred) / len(y)
77
+
78
+ # Create DataFrame for confusion matrix
79
+ classes = np.unique(y)
80
+ df_cm = pd.DataFrame(cm, index=classes, columns=classes)
81
+
82
+ # Create hover text
83
+ hover_text = [['Actual: {}<br>Predicted: {}<br>Value: {:.2f}'.format(classes[i], classes[j], cm[i, j])
84
+ for j in range(len(classes))] for i in range(len(classes))]
85
+
86
+ # Create heatmap using Plotly with hover text
87
+ fig = ff.create_annotated_heatmap(z=df_cm.values,
88
+ x=list(classes),
89
+ y=list(classes),
90
+ colorscale='blues',
91
+ hoverinfo='text',
92
+ text=hover_text)
93
+
94
+ # Update heatmap layout
95
+ fig.update_layout(
96
+ title='Confusion Matrix',
97
+ xaxis_title='Predicted',
98
+ yaxis_title='Actual',
99
+ font=dict(size=14)
100
+ )
101
+
102
+ # Display Plotly figure in Streamlit
103
+ #st.plotly_chart(fig)
104
+ #classification report
105
+ report = classification_report(y, Y_pred, output_dict=True)
106
+ # Convert the classification report to a DataFrame
107
+ report_df = pd.DataFrame(report).transpose()
108
+ # prep data
109
+ X, y = df_pca[req_cols], df_pca[[flag]]
110
+ #X, y = df.drop(columns=[flag,identifier]), df[[flag]]
111
+ # scale features
112
+ # min_max_scaler = MinMaxScaler()
113
+ # X_norm = min_max_scaler.fit_transform(X)
114
+ #X_norm = (X - X.min()) / (X.max() - X.min())
115
+ # run inference
116
+ y_pred_proba = model.predict_proba(X)
117
+ y_pred_df = pd.DataFrame(y_pred_proba)
118
+ df_pca.insert(0, 'propensity_score', y_pred_df[1])
119
+ # df_pca[identifier] = identifier_df
120
+ # df_pca[identifier]=df_pca[identifier].astype('str')
121
+ # Display classification report
122
+ st.subheader("Classification Report")
123
+ st.dataframe(report_df,width=600)
124
+
125
+ # Display confusion matrix
126
+ # st.subheader("Confusion Matrix")
127
+ # st.write(df_cm,width=600)
128
+
129
+ # Display confusion matrix
130
+ st.subheader("Confusion matrix")
131
+ st.plotly_chart(fig)
132
+ return df_pca[['propensity_score']]
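+ # Minimal usage sketch (hypothetical names; assumes a numeric feature frame
+ # plus an identifier column and a 0/1 treatment flag):
+ #   scores = point_estimates(binned_df, model_type='linear', flag='Y',
+ #                            identifier='id', control_sample_size=1000,
+ #                            solver='lbfgs', max_iter=1000)
+ #   binned_df['propensity_score'] = scores['propensity_score']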
133
+
134
+
135
+
136
+ # if 'df' in st.session_state:
137
+ # task_type = st.sidebar.selectbox("Task Type", ["classification", "regression"],key="task_type")
138
+ # model_type = st.sidebar.selectbox("Model Type", ["linear", "xgboost"])
139
+ # flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
140
+ # identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))
141
+ # st.sidebar.write("Applicable only for Regression model type")
142
+ # dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
143
+ # st.session_state.flag=flag
144
+ # st.session_state.identifier=identifier
145
+ # # Sidebar for user inputs
146
+ # if flag is not None:
147
+ # with st.expander("Model Configuration", expanded=True):
148
+ # unique_flag_values = st.session_state.df[flag].unique()
149
+ # for value in unique_flag_values:
150
+ # st.write(f"Y == {value}: {len(st.session_state.df[st.session_state.df[flag] == value])}")
151
+ # control_sample_size = st.text_input("Control Sample Size")
152
+
153
+ # try:
154
+ # # Try converting to an integer
155
+ # control_sample_size = int(control_sample_size)
156
+
157
+ # # Check if control_sample_size is within the valid range
158
+ # flag_0_size = len(st.session_state.df[st.session_state.df[flag] == 0])
159
+ # if control_sample_size < 0 or control_sample_size > flag_0_size:
160
+ # st.error(f"Control Sample Size must be between 0 and {flag_0_size}.")
161
+
162
+ # except ValueError:
163
+ # st.error("Please enter a valid integer for Control Sample Size.")
164
+
165
+
166
+ # #st.write("Applicable only for Regression model type")
167
+ # #if st.session_state.get("task_type","") == "regression":
168
+ # #dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
169
+ # point_estimate_variable = st.text_input("Variable of interest")
170
+ # st.session_state.point_estimate_variable=point_estimate_variable
171
+
172
+ # if st.button("Run Modeling"):
173
+ # result_df = point_estimates(st.session_state.df, task_type, model_type, point_estimate_variable, control_sample_size, flag, identifier, dep_var)
174
+
175
+ # st.session_state.modeling_df = result_df
176
+ # st.session_state.treated_df=result_df[result_df['Y']==1]
177
+ # st.session_state.non_treated_df=result_df[result_df['Y']==0]
178
+
179
+
180
+
181
+
182
+ st.title("Algorithms")
183
+
184
+ #st.subheader("Classification") # Added line
185
+ #classification_option = st.radio("Classification", ["Classification"]) # Added line
186
+
187
+ if 'classification_option' not in st.session_state:
188
+ st.session_state.classification_option = "Classification"
189
+ if 'algorithm_option' not in st.session_state:
190
+ st.session_state.algorithm_option = "Logistic Regression"
191
+
192
+ classification_option = st.radio("Algorithm Type", ["Classification", "Regression"], key="classification_option")
193
+
194
+ # the widget key already keeps st.session_state.classification_option in sync; no manual update is needed
196
+
197
+ if st.session_state.classification_option == "Classification":
198
+ col1, col2 = st.columns(2)
199
+
200
+ with col1:
201
+ st.write("#####")
202
+ lr_checkbox = st.checkbox(
203
+ label="Logistic Regression",
204
+ key="algorithm_lr_cb",
205
+ value=(st.session_state.algorithm_option == "Logistic Regression")
206
+ )
207
+
208
+ with col2:
209
+ st.write("#####")
210
+ show_lr_options = st.checkbox(
211
+ label="Change default options",
212
+ key="lr_options_cb",
213
+ disabled=not lr_checkbox,
214
+ )
215
+
216
+ cols = st.columns((2, 1))
217
+ with cols[0]:
218
+ lr_hyp_placeholder = st.empty()
219
+ lr_model_placeholder = st.empty()
220
+
221
+ solver='lbfgs'
222
+ class_weights=None
223
+ max_iter=1000
224
+ if show_lr_options and lr_checkbox:
225
+ with lr_hyp_placeholder:
226
+ with st.expander("LR parameters"):
227
+ solver=st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
228
+ max_iter=st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
229
+ class_weight_option = st.selectbox(
230
+ 'Select class weights option:',
231
+ ('Custom', 'Balanced')
232
+ )
233
+
234
+ if class_weight_option == 'Custom':
235
+ weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
236
+ weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
237
+ class_weights = {1: weight_1, 0: weight_0}
238
+ elif class_weight_option == 'Balanced':
239
+ class_weights = 'balanced'  # let sklearn weight classes inversely to their frequency
240
+ #control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))
241
+
242
+ col1, col2 = st.columns(2)
243
+
244
+ with col1:
245
+ st.write("#####")
246
+ xgb_checkbox = st.checkbox(
247
+ label="Xgboost Classifier", key="algorithm_xgb_cb",
248
+ value=(st.session_state.algorithm_option == "Xgboost Classifier")
249
+ )
250
+
251
+ with col2:
252
+ st.write("#####")
253
+ show_xgb_options = st.checkbox(
254
+ label="Change default options",
255
+ key="xgb_options_cb",
256
+ disabled=not xgb_checkbox,
257
+ )
258
+
259
+ cols = st.columns((2, 1))
260
+ with cols[0]:
261
+ xgb_hyp_placeholder = st.empty()
262
+
263
+ max_depth=None
264
+ subsample=None
265
+ eta=None
266
+
267
+ if show_xgb_options and xgb_checkbox:
268
+ with xgb_hyp_placeholder:
269
+ with st.expander("XGB hyper parameters"):
270
+ max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
271
+ subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
272
+ eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)
273
+ #control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))
274
+ st.session_state.algorithm_option = "Logistic Regression" if lr_checkbox else "Xgboost Classifier"
275
+
276
+ elif classification_option == "Regression":
277
+ col1, col2 = st.columns(2)
278
+
279
+ with col1:
280
+ st.write("#####")
281
+ lr_checkbox = st.checkbox(
282
+ label="Linear Regression",
283
+ key="algorithm_lr_cb",
284
+ value=(st.session_state.algorithm_option == "Linear Regression")
285
+ )
286
+
287
+ with col2:
288
+ st.write("#####")
289
+ show_lr_options = st.checkbox(
290
+ label="Change default options",
291
+ key="lr_options_cb",
292
+ disabled=not lr_checkbox,
293
+ )
294
+
295
+ cols = st.columns((2, 1))
296
+ with cols[0]:
297
+ lr_hyp_placeholder = st.empty()
298
+ lr_model_placeholder = st.empty()
299
+
300
+ solver='lbfgs'
301
+ class_weights=None
302
+ max_iter=1000
303
+ if show_lr_options and lr_checkbox:
304
+ with lr_hyp_placeholder:
305
+ with st.expander("LR parameters"):
306
+ solver=st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
307
+ max_iter=st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
308
+ class_weight_option = st.selectbox(
309
+ 'Select class weights option:',
310
+ ('Custom', 'Balanced')
311
+ )
312
+
313
+ if class_weight_option == 'Custom':
314
+ weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
315
+ weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
316
+ class_weights = {1: weight_1, 0: weight_0}
317
+ elif class_weight_option == 'Balanced':
318
+ class_weights = 'balanced'  # let sklearn weight classes inversely to their frequency
319
+
320
+ col1, col2 = st.columns(2)
321
+
322
+ with col1:
323
+ st.write("#####")
324
+ xgb_checkbox = st.checkbox(
325
+ label="Xgboost Regression", key="algorithm_xgb_cb",
326
+ value=(st.session_state.algorithm_option == "Xgboost Regression")
327
+ )
328
+
329
+ with col2:
330
+ st.write("#####")
331
+ show_xgb_options = st.checkbox(
332
+ label="Change default options",
333
+ key="xgb_options_cb",
334
+ disabled=not xgb_checkbox,
335
+ )
336
+
337
+ cols = st.columns((2, 1))
338
+ with cols[0]:
339
+ xgb_hyp_placeholder = st.empty()
340
+
341
+ max_depth=None
342
+ subsample=None
343
+ eta=None
344
+
345
+ if show_xgb_options and xgb_checkbox:
346
+ with xgb_hyp_placeholder:
347
+ with st.expander("XGB hyper parameters"):
348
+ max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
349
+ subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
350
+ eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)
351
+ st.session_state.algorithm_option = "Linear Regression" if lr_checkbox else "Xgboost Regression"
352
+
353
+ with cols[0]:
354
+ control_sample_size = st.slider('Control Sample Size', min_value=1, max_value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 0]), value=len(st.session_state.imputed_df[st.session_state.imputed_df[st.session_state.flag] == 1]))
355
+
356
+ #st.subheader("Classification") # Added line
357
+ #classification_option = st.radio("Classification", ["Classification"]) # Added line
358
+
359
+ if st.button("Run Modeling"):
360
+ if lr_checkbox:
361
+ st.session_state.binned_df['propensity_score'] = point_estimates(st.session_state.binned_df,model_type='linear',flag=st.session_state.flag,identifier=st.session_state.identifier,control_sample_size=control_sample_size,solver=solver,max_iter=max_iter,class_weights=class_weights)
362
+ elif xgb_checkbox:
363
+ st.session_state.binned_df['propensity_score'] = point_estimates(st.session_state.binned_df,model_type='xgboost',flag=st.session_state.flag,identifier=st.session_state.identifier,control_sample_size=control_sample_size,max_depth=max_depth, subsample=subsample, eta=eta)
364
+
365
+
366
+ # st.session_state.binned_df['propensity_score'] = result_df['propensity_score']
367
+ st.session_state.treated_df = st.session_state.binned_df[st.session_state.binned_df[st.session_state.flag] == 1]
368
+ st.session_state.non_treated_df = st.session_state.binned_df[st.session_state.binned_df[st.session_state.flag] == 0]
369
+
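Note: the classification report in point_estimates is computed on the same rows the model was fit on, so it reads optimistically. A held-out variant, sketched with generic names (assumed, not part of the committed file):

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def holdout_report(X: pd.DataFrame, y: pd.Series) -> str:
    # Fit on 75% of rows, report on the stratified 25% hold-out.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y)
    model = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
    return classification_report(y_te, model.predict(X_te))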
pages/pages/4_Matching & Diagnostics.py ADDED
@@ -0,0 +1,490 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.neighbors import NearestNeighbors
5
+ from sklearn.preprocessing import StandardScaler
6
+ import xgboost as xgb
7
+ import base64
11
+ import matplotlib.pyplot as plt
14
+ from math import sqrt
15
+ from statistics import mean, variance
16
+ import seaborn as sns
17
+
18
+ import plotly.graph_objects as go
19
+
20
+ def cohend_plot_function(std_mean_diff_df2, std_mean_diff_df, selected_attributes):
21
+ # Create subplot of selected attributes
22
+ fig = go.Figure()
23
+
24
+ x = std_mean_diff_df2[std_mean_diff_df2["Metrics"].isin(selected_attributes)]["Cohend Value"][::-1]
25
+ y = list(std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Metrics"][::-1])
26
+
27
+ x1 = std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Cohend Value"][::-1]
28
+ y1 = list(std_mean_diff_df[std_mean_diff_df["Metrics"].isin(selected_attributes)]["Metrics"][::-1])
29
+
30
+ # Add traces
31
+ fig.add_trace(go.Scatter(
32
+ x=x,
33
+ y=y,
34
+ mode='markers',
35
+ marker=dict(color='blue'),
36
+ name='general_control_cohend'
37
+ ))
38
+
39
+ fig.add_trace(go.Scatter(
40
+ x=x1,
41
+ y=y1,
42
+ mode='markers',
43
+ marker=dict(color='orange', symbol='diamond-open'),
44
+ name='synthetic_control_cohend'
45
+ ))
46
+
47
+ # Add vertical lines
48
+ for val in [-0.1, 0.1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75]:
49
+ fig.add_shape(
50
+ type="line",
51
+ x0=val,
52
+ y0=0,
53
+ x1=val,
54
+ y1=10,
55
+ line=dict(
56
+ color="gray",
57
+ width=1,
58
+ dash="dash",
59
+ )
60
+ )
61
+
62
+ # Add vertical line at x=0
63
+ fig.add_shape(
64
+ type="line",
65
+ x0=0,
66
+ y0=0,
67
+ x1=0,
68
+ y1=10,
69
+ line=dict(
70
+ color="black",
71
+ width=1,
72
+ )
73
+ )
74
+
75
+ # Update layout
76
+ fig.update_layout(
77
+ xaxis=dict(
78
+ title='cohend',
79
+ range=[-1, 1]
80
+ ),
81
+ yaxis=dict(
82
+ title='Metrics',
83
+ autorange="reversed"
84
+ ),
85
+ legend=dict(
86
+ orientation="h",
87
+ yanchor="bottom",
88
+ y=1.02,
89
+ xanchor="right",
90
+ x=1
91
+ )
92
+ )
93
+
94
+ # Show
95
+ st.plotly_chart(fig,use_container_width=True)
96
+
97
+
98
+ def plot_comparison(comparison_df):
99
+ fig = go.Figure()
100
+
101
+ # Add bars for treatment and control values
102
+ fig.add_trace(go.Bar(
103
+ x=comparison_df.index,
104
+ y=comparison_df[comparison_df.columns[0]],
105
+ name='Treatment',
106
+ marker=dict(color='#053057'),
107
+ ))
108
+
109
+ fig.add_trace(go.Bar(
110
+ x=comparison_df.index,
111
+ y=comparison_df[comparison_df.columns[1]],
112
+ name='Control',
113
+ marker=dict(color='#8ac4f8'),
114
+ ))
115
+
116
+ # Update layout
117
+ fig.update_layout(
118
+ xaxis=dict(
119
+ title='quartiles'
120
+ ),
121
+ yaxis=dict(
122
+ title='values'
123
+ ),
124
+ barmode='group',
125
+ title=comparison_df.columns[0].split('treatment')[1][1:]
126
+ )
127
+
128
+ # Show
129
+ st.plotly_chart(fig,use_container_width=True)
130
+
131
+
132
+ def plot_propensity_distribution(treatment_data, control_data):
133
+ fig = go.Figure()
134
+
135
+ # Add histograms for treatment and control data
136
+ fig.add_trace(go.Histogram(
137
+ x=treatment_data,
138
+ name='Treatment',
139
+ marker=dict(color='#053057'),
140
+ opacity=0.6
141
+ ))
142
+
143
+ fig.add_trace(go.Histogram(
144
+ x=control_data,
145
+ name='Control',
146
+ marker=dict(color='#8ac4f8'),
147
+ opacity=0.6
148
+ ))
149
+
150
+ # Update layout
151
+ fig.update_layout(
152
+ xaxis=dict(
153
+ title='propensity_score'
154
+ ),
155
+ yaxis=dict(
156
+ title='count'
157
+ ),
158
+ barmode='overlay',
159
+ title='Propensity Distribution'
160
+ )
161
+
162
+ # Show
163
+ st.plotly_chart(fig,use_container_width=True)
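+ # Reading the chart: strong overlap between the two histograms indicates
+ # common support; little overlap means many treated rows lack a comparable control.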
164
+
165
+ def comparison(df, variable, flag='Y'):
166
+ # generates a comparison df for any given feature, split by the treatment flag
167
+ treatment_values = df[df[flag] == 1].groupby('quartiles')[variable].mean()
168
+ control_values = df[df[flag] == 0].groupby('quartiles')[variable].mean()
169
+ comparison = pd.merge(treatment_values, control_values, left_index=True, right_index=True)
170
+ comparison.rename({f'{variable}_x': f'treatment_{variable}', f'{variable}_y': f'control_{variable}'}, axis=1, inplace=True)
171
+ comparison['difference'] = np.abs(comparison[f'treatment_{variable}'] - comparison[f'control_{variable}'])
172
+ comparison['percent_difference'] = np.abs((comparison[f'treatment_{variable}'] - comparison[f'control_{variable}']) / comparison[f'treatment_{variable}'])
173
+ return comparison
174
+
175
+
176
+ # Function to calculate Cohen's d for independent samples
177
+
178
+ def cohend(d1, d2):
179
+ n1, n2 = len(d1), len(d2)
180
+ s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1)
181
+ s = sqrt(((n1-1) * s1 + (n2-1) * s2) / (n1 + n2 - 2))
182
+ u1, u2 = mean(d1), mean(d2)
183
+ # Check if the standard deviation is zero
184
+ if s == 0:
185
+ return 0 # Return 0 when the denominator is zero
186
+ else:
187
+ return (u1 - u2) / s
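+ # Worked example: d1 = [1, 2, 3] and d2 = [2, 3, 4] each have sample variance 1,
+ # so the pooled s is 1.0 and cohend(d1, d2) == -1.0, i.e. the group means
+ # differ by one pooled standard deviation.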
188
+
189
+ # Function to calculate standardized mean differences
190
+ def std_mean_diff(group_A_df, group_B_df):
191
+ cohend_values_arr = [0] * len(group_A_df.columns)
192
+
193
+ for i in range(len(group_A_df.columns)):
194
+ cohend_values_arr[i] = cohend(group_A_df[group_A_df.columns[i]], group_B_df[group_A_df.columns[i]])
195
+
196
+ cohend_array_pre_transp = [group_A_df.columns, cohend_values_arr]
197
+ np_array = np.array(cohend_array_pre_transp)
198
+ cohend_array = np.transpose(np_array)
199
+
200
+ return cohend_array
201
+
202
+ # Function to get matched IDs and calculate Cohen's d values
203
+ def cohend_code_function(binned_df, matching_df):
204
+ treat_df_complete = binned_df[binned_df['Y'] == 1].copy()  # .copy() avoids SettingWithCopyWarning on the drops below
205
+ control_df_complete = binned_df[binned_df['Y'] == 0].copy()
206
+ treat_df_complete.drop('Y', axis =1, inplace = True)
207
+ control_df_complete.drop('Y', axis =1, inplace = True)
208
+ treatment_cust = pd.DataFrame()
209
+ control_cust = pd.DataFrame()
210
+ treatment_cust['individual_id_ov'] = matching_df["Id"]
211
+ control_cust['individual_id_ov'] = matching_df["matched_Id"]
212
+
213
+ #getting cohend values for synthetic control population
214
+
215
+ group_A_df = treatment_cust[['individual_id_ov']]
216
+ group_A_df = group_A_df.merge(treat_df_complete,
217
+ how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
218
+ group_B_df = control_cust[['individual_id_ov']]
219
+ group_B_df = group_B_df.merge(control_df_complete,
220
+ how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
221
+
222
+ group_A_df.drop('individual_id_ov', axis =1, inplace = True)
223
+ group_B_df.drop('individual_id_ov', axis =1, inplace = True)
224
+
225
+ cohensd_df = std_mean_diff(group_A_df, group_B_df)
226
+ std_mean_diff_df = pd.DataFrame(columns=["Metrics","Cohend Value"])
227
+ for i in range(len(cohensd_df)):
228
+ std_mean_diff_df.loc[len(std_mean_diff_df.index)] = [cohensd_df[i][0],round(float(cohensd_df[i][1]),2)]
229
+
230
+ std_mean_diff_df["flag"] = std_mean_diff_df.apply(lambda x : 1 if (x["Cohend Value"]>0.1 or x["Cohend Value"]<-0.1) else 0, axis =1)
231
+ st.write('Number of variables whose standardized mean difference between treatment and control falls outside the desired range (-0.1, 0.1): ', std_mean_diff_df["flag"].sum())
232
+
233
+
234
+ # Download cohend output table
235
+ st.write(std_mean_diff_df)
236
+
237
+ #getting cohend values for General population
238
+
239
+ group_A_df = treatment_cust[['individual_id_ov']]
240
+ group_A_df = group_A_df.merge(treat_df_complete,
241
+ how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
242
+ group_B_df = control_df_complete[['individual_id_ov']]
243
+ group_B_df = group_B_df.merge(control_df_complete,
244
+ how = 'left',right_on='individual_id_ov',left_on='individual_id_ov')
245
+
246
+ group_A_df.drop('individual_id_ov', axis =1, inplace = True)
247
+ group_B_df.drop('individual_id_ov', axis =1, inplace = True)
248
+
249
+ cohensd_df = std_mean_diff(group_A_df, group_B_df)
250
+
251
+ std_mean_diff_df2 = pd.DataFrame(columns=["Metrics","Cohend Value"])
252
+
253
+ for i in range(len(cohensd_df)):
254
+ std_mean_diff_df2.loc[len(std_mean_diff_df2.index)] = [cohensd_df[i][0],round(float(cohensd_df[i][1]),2)]
255
+
256
+ return std_mean_diff_df2, std_mean_diff_df
257
+
258
+ def calculate_iv(df, flag, identifier):
259
+ df1 = df.drop([flag, identifier, 'propensity_score'], axis=1)
260
+ iv_df = pd.DataFrame(columns=['Feature', 'IV'])
261
+ for column in df1.columns:
262
+ data = pd.concat([pd.qcut(df1[column], q=10, duplicates='drop'), df[flag]], axis=1)
263
+ groups = data.groupby(by=column)[df[flag].name].agg(['count', 'sum'])
264
+ groups['event_rate'] = groups['sum'] / groups['count']
265
+ groups['non_event_rate'] = (groups['count'] - groups['sum']) / groups['count']
266
+ groups['WOE'] = np.log(groups['event_rate'] / groups['non_event_rate'])
267
+ groups['IV'] = (groups['event_rate'] - groups['non_event_rate']) * groups['WOE']
268
+ iv = groups['IV'].sum()
269
+ iv_df = pd.concat([iv_df, pd.DataFrame({'Feature': [column], 'IV': [iv]})],axis=0, ignore_index=True)
270
+ return iv_df
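+ # Note: WOE here uses within-bin event vs non-event rates; the classical
+ # credit-scoring WOE uses each bin's share of all events vs all non-events.
+ # Common IV rules of thumb: <0.02 unpredictive, 0.02-0.1 weak, 0.1-0.3
+ # medium, >0.3 strong.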
271
+
272
+ def xgboost_feature_importance(df, flag,identifier):
273
+ X, y = df.drop([flag,identifier,'propensity_score'],axis=1), df[[flag]]
274
+ model = xgb.XGBClassifier()
275
+ model.fit(X, y)
276
+ importances = model.feature_importances_
277
+ importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
278
+ importance_df = importance_df.sort_values(by='Importance', ascending=False)
279
+ return importance_df
280
+
281
+ # iv_result = calculate_iv(df_features, df_target)
282
+ # importance_result = xgboost_feature_importance(df_features, df_target)
283
+
284
+
285
+ def get_matching_pairs(identifier,treated_df, non_treated_df, sample_size_A, sample_size_B,matching_columns,flag):
286
+ # if treated_df[identifier].isna().any() or non_treated_df[identifier].isna().any():
287
+ # st.error("The identifier should not contain Nan's")
288
+
289
+ treated_df = treated_df[matching_columns].sample(frac=sample_size_A/100)
290
+ non_treated_df = non_treated_df[matching_columns].sample(frac=sample_size_B/100)
291
+
292
+ treated_df = treated_df.set_index(st.session_state.identifier)
293
+ treated_df.drop(flag,axis=1,inplace=True)
294
+
295
+ non_treated_df = non_treated_df.set_index(st.session_state.identifier)
296
+ non_treated_df.drop(flag,axis=1,inplace=True)
297
+
298
+ treated_x = treated_df.values
299
+ non_treated_x = non_treated_df.values
300
+
301
+ scaler = StandardScaler()
302
+ scaler.fit(treated_x)
303
+ treated_x = scaler.transform(treated_x)
304
+ non_treated_x = scaler.transform(non_treated_x)
305
+
306
+
307
+ print("data transformation completed")
308
+
309
+ nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(non_treated_x)
310
+
311
+ print("model fitting completed")
312
+
313
+ distances, indices = nbrs.kneighbors(treated_x)
314
+
315
+ print("matching completed")
316
+
317
+ indices = indices.reshape([1,indices.shape[0]*indices.shape[1]])
318
+
319
+ # n_neighbors=1 yields exactly one match per treated row
+ res = list(treated_df.index)
323
+
324
+
325
+ output_df = pd.DataFrame()
326
+ output_df["Id"] = res
327
+ output_df["matched_Id"] = non_treated_df.iloc[indices[0]].index
328
+
329
+ return output_df
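+ # Usage sketch (column names illustrative): this is 1-NN matching with
+ # replacement, so one control row may be matched to several treated rows.
+ #   pairs = get_matching_pairs('individual_id_ov', treated, controls,
+ #                              sample_size_A=100, sample_size_B=100,
+ #                              matching_columns=['individual_id_ov', 'Y',
+ #                                                'propensity_score'],
+ #                              flag='Y')
+ #   pairs.columns  ->  ['Id', 'matched_Id']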
330
+
331
+ # Streamlit App
332
+ st.title("Matching")
333
+
334
+ # Calculate IV
335
+ iv_df = calculate_iv(st.session_state.binned_df, st.session_state.flag, st.session_state.identifier)
336
+
337
+ # Calculate XGBoost feature importance
338
+ importance_df = xgboost_feature_importance(st.session_state.binned_df, st.session_state.flag, st.session_state.identifier)
339
+
340
+ # Combine IV and feature importance into a final DataFrame
341
+ combined_df = pd.merge(iv_df, importance_df, on='Feature', suffixes=('_iv', '_importance'))
342
+ combined_df['Avg_IV_Importance'] = (combined_df['IV'] + combined_df['Importance']) / 2
343
+ combined_df.sort_values('Avg_IV_Importance',inplace=True,ascending=False)
344
+ # Add the 'Select' column with checkboxes
345
+ combined_df.insert(0, 'Select', False)
346
+ combined_df.reset_index(drop=True,inplace=True)
347
+
348
+ # Display the feature importances
349
+ st.subheader("Feature importances")
350
+ st.session_state["edited_df_combined"] = st.data_editor(
351
+ combined_df.style.hide(axis="index"),
352
+ column_config={
353
+ "Select": st.column_config.CheckboxColumn(required=True)
354
+ },
355
+ disabled=combined_df.drop("Select", axis=1).columns,use_container_width=True
356
+ )
357
+
358
+ # Allow users to enter the number of top features they want to select
359
+ top_features_input = st.number_input("Enter the number of top features", min_value=1, max_value=len(combined_df), value=None)
360
+
361
+ if top_features_input is not None:
362
+ # Select the top features based on user input
363
+ selected_df = combined_df.head(top_features_input)
364
+ selected_features = selected_df['Feature'].tolist()
365
+ else:
366
+ # Check if any features are selected via checkboxes
367
+ selected_features = st.session_state.edited_df_combined[st.session_state.edited_df_combined['Select']]['Feature'].tolist()
368
+
369
+ # Determine the selected features based on user input
370
+ #selected_features = checkbox_selected_features if checkbox_selected_features else selected_features
371
+
372
+ selected_features.append(st.session_state.identifier)
373
+ selected_features.append(st.session_state.flag)
374
+ # Update the session state with the selected features
375
+ st.session_state.selected_features = selected_features
376
+
377
+ with st.expander("Matching Inputs",expanded=True):
378
+ st.write("Matching Inputs")
379
+ ui_columns = st.columns((1, 1))
380
+ with ui_columns[0]:
381
+ sample_size_A = st.slider("Sample Size for treatment Group", 1, 100, 100)
382
+ with ui_columns[1]:
383
+ sample_size_B = st.slider("Sample Size for Control Group", 1, 100, 100)
384
+ with ui_columns[0]:
385
+ st.write("#")
386
+ run_matching = st.button(
387
+ label="Run Matching"
388
+ )
389
+ st.divider()
390
+ if run_matching:
391
+ matching_df = get_matching_pairs(st.session_state.identifier,st.session_state.treated_df, st.session_state.non_treated_df, sample_size_A, sample_size_B,st.session_state.selected_features,st.session_state.flag)
392
+ st.session_state.matching_df = matching_df
393
+ # Display the result
394
+ st.dataframe(st.session_state.matching_df)
395
+ if st.session_state.matching_df is not None:
396
+ #with st.expander("Download Matching DF"):
397
+ download_button = st.download_button(
398
+ label="Download Matched Data as CSV",
399
+ data=st.session_state.matching_df.to_csv(index=False).encode(),
400
+ file_name='matching_data.csv',
401
+ mime='text/csv',
402
+ )
403
+
404
+ if 'matching_df' not in st.session_state:
+ st.info("Run matching above to generate matched pairs before the diagnostics below.")
+ st.stop()
406
+
407
+ st.subheader("Matching diagnostics")
408
+ control_group = st.session_state.binned_df[st.session_state.binned_df[st.session_state.identifier].isin(st.session_state.matching_df['matched_Id'])]
409
+ treatment_group = st.session_state.binned_df[st.session_state.binned_df[st.session_state.flag] == 1]
410
+
411
+ #create combined group and add ventiles
412
+ combined_group = pd.concat([control_group, treatment_group])
413
+ combined_group['quartiles'] = pd.qcut(combined_group['propensity_score'], 4, labels=False)
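+ # pd.qcut buckets rows into four equal-sized propensity quartiles (0-3), which the comparison plots below group by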
414
+
415
+ combined_group.drop(st.session_state.identifier,axis=1,inplace=True)
416
+ st.session_state.combined_group=combined_group
417
+
418
+ if 'perform_diagnostics' not in st.session_state:
419
+ st.session_state.perform_diagnostics = False
420
+
421
+ # Display button
422
+ perform_diagnostics = st.button(label="Run Diagnostics")
423
+
424
+ if perform_diagnostics or st.session_state.perform_diagnostics:
425
+ st.session_state.perform_diagnostics = True
426
+ with st.expander("Matching Diagnostics", expanded=True):
427
+ left, right = st.columns(2)
428
+ std_mean_diff_df2,std_mean_diff_df = cohend_code_function(st.session_state.binned_df, st.session_state.matching_df)
429
+ st.subheader("Cohen's d Plot")
430
+ cohend_plot_function(std_mean_diff_df2,std_mean_diff_df, selected_features)
431
+
432
+ # Pre-matching Propensity Distribution
433
+ st.subheader("Pre-matching Propensity Distributions")
434
+ plot_propensity_distribution(st.session_state.binned_df[st.session_state.binned_df[st.session_state.flag] == 1]['propensity_score'], st.session_state.binned_df[st.session_state.binned_df[st.session_state.flag] == 0]['propensity_score'])
435
+
436
+ # Post-matching Propensity Distribution
437
+ st.subheader("Post-matching Propensity Distributions")
438
+ temp = pd.merge(left=st.session_state.matching_df, right=st.session_state.binned_df[[st.session_state.identifier, 'propensity_score']], left_on='Id', right_on=st.session_state.identifier, how='left')
439
+ temp.drop(st.session_state.identifier, axis=1, inplace=True)
440
+ temp.rename({'Id': 'treatment_id', 'matched_Id': 'control_id', 'propensity_score': 'treatment_propensity'}, axis=1, inplace=True)
441
+ temp = pd.merge(left=temp, right=st.session_state.binned_df[[st.session_state.identifier, 'propensity_score']], left_on='control_id', right_on=st.session_state.identifier, how='left')
442
+ temp.drop(st.session_state.identifier, axis=1, inplace=True)
443
+ temp.rename({'propensity_score': 'control_propensity'}, axis=1, inplace=True)
444
+
445
+ plot_propensity_distribution(temp['treatment_propensity'],temp['control_propensity'])
446
+
447
+
448
+
449
+ with st.expander("Comparison Plots",expanded=True):
450
+ st.markdown(
451
+ "<p class='plot-header'>Change the selected variable to plot"
452
+ " different charts</p>",
453
+ unsafe_allow_html=True,
454
+ )
455
+ left, right = st.columns(2)
456
+ with left:
457
+ if 'selected_variable_comp' not in st.session_state:
458
+ st.session_state.selected_variable_comp = [] # Initialize selected_variable
459
+
460
+ selected_variable_comp = st.multiselect(
461
+ "Variable",
462
+ st.session_state.combined_group.columns,
463
+ st.session_state.selected_variable_comp # Set the default value to the stored session state
464
+ )
465
+
466
+ # Update session state with selected variable
467
+ st.session_state.selected_variable_comp = selected_variable_comp
468
+
469
+ if st.session_state.selected_variable_comp:
470
+ # Plot comparisons for selected variables
471
+ comparisons = {}
472
+ for var in st.session_state.selected_variable_comp:
473
+ comparisons[var] = comparison(combined_group, var, flag=st.session_state.flag)
474
+ plot_comparison(comparisons[var])
475
+
476
+
477
+ # selected_variables = st.multiselect("Select variables for comparison", combined_group.columns)
478
+ # if selected_variables:
479
+ # # Plot comparisons for selected variables
480
+ # comparisons = {}
481
+ # for var in selected_variables:
482
+ # comparisons[var] = comparison(combined_group, var)
483
+ # plot_comparison(comparisons[var])
484
+
485
+
486
+
487
+
488
+
489
+
490
+
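Once matching has run, the eventual treatment-effect read-out is a paired difference on an outcome column. A minimal sketch (the outcome column is hypothetical; Id/matched_Id are the columns produced by get_matching_pairs above):

import pandas as pd

def matched_effect(df: pd.DataFrame, pairs: pd.DataFrame, outcome: str,
                   id_col: str = 'individual_id_ov') -> float:
    # Mean treated-minus-matched-control gap on `outcome`.
    out = df.set_index(id_col)[outcome]
    return float((out.loc[pairs['Id']].to_numpy()
                  - out.loc[pairs['matched_Id']].to_numpy()).mean())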
requirements.txt ADDED
@@ -0,0 +1,30 @@
1
+ dash==2.9.3
2
+ dash_auth==2.0.0
3
+ dash_bootstrap_components==1.4.1
4
+ holidays==0.24
5
+ hyperopt==0.2.7
6
+ joblib==1.2.0
7
+ matplotlib==3.5.1
8
+ mdutils==1.5.0
9
+ numpy==1.22.4
10
+ openpyxl==3.0.10
11
+ openpyxl_image_loader==1.0.5
12
+ pandas==1.5.2
13
+ # Pillow==9.4.0
14
+ Pillow==10.2.0
15
+ plotly==5.14.1
16
+ pmdarima==2.0.2
17
+ prophet==1.1.2
18
+ python-dotenv==1.0.0
19
+ # pytz==2022.7.1
20
+ pytz==2022.7
21
+ scikit_learn==1.2.2
22
+ scipy==1.7.3
23
+ seaborn==0.11.2
24
+ shap==0.41.0
25
+ statsmodels==0.13.5
26
+ streamlit==1.27.2
27
+ streamlit-aggrid==0.3.4.post3
28
+ sweetviz==2.3.1
29
+ waitress==2.1.2
30
+ xgboost==1.6.2
styles.css ADDED
@@ -0,0 +1,58 @@
1
+ html {
2
+ margin: 0;
3
+ }
4
+
5
+ #MainMenu {
6
+
7
+ visibility: collapse;
8
+ }
9
+
10
+ footer {
11
+ visibility: collapse;
12
+ }
13
+
14
+ div.block-container{
15
+ padding: 2rem 3rem;
16
+ }
17
+
18
+
19
+ .main-header {
20
+ display: flex;
21
+ flex-direction: row;
22
+ justify-content: space-between;
23
+ align-items: center;
24
+ }
25
+ .main-header > img {
26
+ max-height: 96px;
27
+ /* max-width: 300px; */
28
+ object-fit: cover;
29
+ }
30
+
31
+
32
+
33
+ button div {
34
+ overflow: hidden;
35
+ text-overflow:ellipsis;
36
+ white-space: nowrap;
37
+ }
38
+
39
+
40
+
41
+ h1 {
42
+ color: #053057;
43
+ }
44
+
45
+ hr {
46
+ height: 10px !important;
47
+ color: #053057;
48
+ }
49
+
50
+ p.plot-header {
51
+ font-size: small;
52
+ font-weight: bold;
53
+ }
54
+
55
+ hr {
56
+ margin: 0 0 10px 0;
57
+ padding: 0;
58
+ }