File size: 14,839 Bytes
91e0a48
d89f303
 
 
 
 
 
e8ef7ba
d89f303
6d3f583
91e0a48
3430dd0
 
d89f303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17ef366
fbe48a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53b8179
fbe48a0
 
 
d89f303
 
 
 
 
 
 
 
17ef366
d89f303
 
 
e8c4461
d89f303
 
 
 
 
 
 
76275be
d89f303
 
76275be
a0d26a9
becf373
a0d26a9
becf373
 
 
 
a0d26a9
becf373
a0d26a9
 
 
e842846
a0d26a9
d89f303
 
76275be
d89f303
76275be
d89f303
76275be
d89f303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8c4461
76275be
e8c4461
 
76275be
 
 
 
 
 
 
 
 
 
 
e8c4461
76275be
e8c4461
 
 
 
9975064
7aaadbd
 
76275be
7aaadbd
 
76275be
 
7aaadbd
 
 
 
e8c4461
 
 
 
d89f303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76275be
d89f303
76275be
 
d89f303
4f86c96
76275be
 
 
 
 
 
 
 
 
d89f303
76275be
 
d89f303
76275be
d89f303
76275be
d89f303
76275be
 
 
d89f303
76275be
 
d89f303
76275be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f86c96
54cdb5b
 
76275be
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
import streamlit as st
import os
from streamlit_option_menu import option_menu
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from streamlit_ace import st_ace
from streamlit_pandas_profiling import st_profile_report
import pandas_profiling



def set_data_files_session_object(file_name, file_path):
    """Record ``file_path`` under ``file_name`` in ``st.session_state['data_files']``.

    The ``'data_files'`` mapping is created the first time this is called;
    an existing entry with the same ``file_name`` is overwritten.
    """
    already_present = 'data_files' in st.session_state
    path_by_name = st.session_state['data_files'] if already_present else {}
    path_by_name[file_name] = file_path
    # Re-assign so the (possibly brand new) mapping is stored in session state.
    st.session_state['data_files'] = path_by_name

def set_filtered_data_session_object(df, file_name):
    """Store ``df`` as the filtered frame for ``file_name``.

    Frames are kept in the ``st.session_state['filtered_data']`` dictionary,
    which is created on first use. A previous frame stored for the same
    ``file_name`` is replaced.
    """
    if 'filtered_data' in st.session_state:
        frames_by_file = st.session_state['filtered_data']
    else:
        frames_by_file = {}
    frames_by_file[file_name] = df
    st.session_state['filtered_data'] = frames_by_file

def set_dataframe_session_object(file_name, file_path):
    """Load ``file_path`` with ``pd.read_csv`` and cache it under ``file_name``.

    The parsed frame is placed in the ``st.session_state['data_frames']``
    dictionary (created on first use), replacing any frame already stored
    for the same ``file_name``.
    """
    session = st.session_state
    frame_by_name = session['data_frames'] if 'data_frames' in session else {}
    # The file is always parsed as CSV, whatever its extension.
    frame_by_name[file_name] = pd.read_csv(file_path)
    session['data_frames'] = frame_by_name
        
def save_file(file_object):
    """Write an uploaded file into ``./uploaded_files`` and register it.

    Args:
        file_object: Uploaded-file object exposing ``name`` and
            ``getbuffer()`` (as used below).

    Raises:
        ValueError: If the upload name has no usable file-name component.

    Side effects:
        Creates ``<cwd>/uploaded_files`` if needed, writes the file there and
        records it through ``set_data_files_session_object`` and
        ``set_dataframe_session_object``.
    """
    # The name is supplied by the client: keep only its last path component so
    # something like "../../app.py" cannot be written outside the upload dir.
    safe_name = os.path.basename(file_object.name.replace("\\", "/"))
    if not safe_name:
        raise ValueError(f"Invalid upload file name: {file_object.name!r}")

    upload_dir = os.path.join(os.getcwd(), "uploaded_files")
    # Do not rely on the caller having created the directory.
    os.makedirs(upload_dir, exist_ok=True)

    file_path = os.path.join(upload_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(file_object.getbuffer())

    set_data_files_session_object(safe_name, file_path)
    set_dataframe_session_object(safe_name, file_path)
    



def create_upload_file_component():
    """Render the file uploader and persist every file it returns.

    Each uploaded file is handed to ``save_file`` after the
    ``uploaded_files`` directory under the current working directory has
    been created.

    NOTE(review): the uploader accepts xls/xlsx/pkl/pdf, but the save path
    parses every file with ``pd.read_csv`` -- confirm whether non-CSV
    uploads are meant to be supported.
    """
    accepted_types = ['csv', 'xls', 'xlsx', 'pkl', 'pdf']
    uploads = st.file_uploader("Upload one file at a time.", type=accepted_types,
                               accept_multiple_files=True)

    # Nothing uploaded yet: leave the page as it is.
    if not uploads:
        return

    target_dir = os.path.join(os.getcwd(), "uploaded_files")
    os.makedirs(target_dir, mode=0o777, exist_ok=True)
    for upload in uploads:
        save_file(upload)

def create_component_to_add_target_func(selected_files, dfs, i):
    """Render the target-variable input and run a hard-coded demo function.

    Args:
        selected_files: Unused here; kept for signature compatibility.
        dfs: Unused here; kept for signature compatibility.
        i: Index used to build a unique widget key.

    The demo snippet is executed into an explicit namespace. A bare
    ``exec(code)`` inside a function does not create a usable ``f1`` name,
    so the original ``f1(3)`` call raised ``NameError``.
    """
    # The widget is rendered even though its value is not consumed yet.
    st.text_input("Name of the target variable", key="target_var" + str(i))

    # TODO: replace the hard-coded snippet with code taken from an
    # ``st_ace(language="python")`` editor, as sketched in earlier revisions.
    code = "def f1(x): return str(x * 3)"
    namespace = {}
    exec(code, namespace)
    st.write(namespace["f1"](3))

def _coerce_filter_value(column, raw_value):
    """Convert text typed by the user so it can be compared with ``column``.

    Returns a ``float`` for numeric columns (``None`` if the text is not a
    number) and the unchanged text for every other dtype.
    """
    if pd.api.types.is_numeric_dtype(column):
        try:
            return float(raw_value)
        except ValueError:
            return None
    return raw_value


def _apply_filter(df, col_to_filter, filter_operation, filter_value):
    """Return the rows of ``df`` whose ``col_to_filter`` satisfies the filter."""
    column = df[col_to_filter]
    if filter_operation == 'In':
        mask = column.isin(filter_value)
    elif filter_operation == 'Equals':
        mask = column == filter_value
    elif filter_operation == 'Greater Than':
        mask = column > filter_value
    elif filter_operation == 'Less Than':
        mask = column < filter_value
    elif filter_operation == 'In Between':
        lower, upper = filter_value
        mask = column.between(lower, upper)  # inclusive on both ends
    else:
        raise ValueError(f"Unsupported filter operation: {filter_operation!r}")
    return df[mask]


def set_filtered_data(df, selected_files, i):
    """Render the filter widgets for one file and store the filtered frame.

    Args:
        df: Data frame to filter.
        selected_files: Names of the selected files; ``selected_files[i]`` is
            the key under which the filtered frame is stored.
        i: Position of the file, also used to build unique widget keys.

    The result is saved with ``set_filtered_data_session_object`` only once
    the user has supplied a usable filter value; otherwise nothing is
    stored.
    """
    action = "data_filter"
    col_to_filter = st.selectbox("Select the field to Filter on ", df.columns.values,
                                 key=action + "_col_filter_" + str(i))
    filter_operation = st.selectbox("Operation ",
                                    ['Greater Than', 'Equals', 'Less Than', "In", "In Between"],
                                    key=action + "_col_filter_op_" + str(i))
    if col_to_filter is None or not filter_operation:
        return

    value_key = action + "_col_filter_val_" + str(i)
    column = df[col_to_filter]
    filter_value = None  # None means "no usable filter value yet"

    if filter_operation == 'In':
        chosen = st.multiselect("Select Values to Filter on ", column.unique(), key=value_key)
        if chosen:
            filter_value = chosen
    elif filter_operation == 'In Between':
        # Offer every distinct value so the slider can return a (low, high)
        # pair; the previous two-option slider returned a single value.
        options = sorted(column.dropna().unique())
        if len(options) >= 2:
            filter_value = st.select_slider("Select range", options=options,
                                            value=(options[0], options[-1]),
                                            key=value_key)
        elif options:
            # A single distinct value leaves nothing to choose.
            filter_value = (options[0], options[0])
    else:
        raw_value = st.text_input("Enter a numeric value", key=value_key)
        if raw_value:
            # Text input always yields str; comparing that with a numeric
            # column raises TypeError (or never matches for equality).
            filter_value = _coerce_filter_value(column, raw_value)
            if filter_value is None:
                st.warning(f"'{raw_value}' is not a number; column "
                           f"'{col_to_filter}' needs a numeric value.")

    if filter_value is None:
        return

    filtered_df = _apply_filter(df, col_to_filter, filter_operation, filter_value)
    set_filtered_data_session_object(filtered_df, selected_files[i])
    st.write('data filtered', filtered_df.shape)


def _render_profile_report(df_for_analysis, i):
    """Render the profiling controls and, once requested, the report."""
    # Dedicated keys: the old key duplicated the "Analyse on Filtered Data"
    # checkbox key, and the button had no key at all.
    full_data_check = st.checkbox("Report on all columns",
                                  key="profile_full_data_check_" + str(i))
    if full_data_check:
        st.warning("This might take a lot of time to generate the report depending on the size of the data.Select a subset of columns")
        if st.button("Run on full data", key="profile_full_run_" + str(i)):
            st_profile_report(df_for_analysis.profile_report())
        return

    col_subset = st.multiselect("Select subset of columns", df_for_analysis.columns.values,
                                key='filter_subset_' + str(i))
    if col_subset:
        st_profile_report(df_for_analysis[col_subset].profile_report())


def _build_distribution_figure(series, col):
    """Build a count bar chart with a %-distribution line for the top 20 values."""
    counts = series.value_counts()
    top_counts = counts.head(20)
    value_dist_df = pd.DataFrame({
        col: top_counts.index,
        'Count': top_counts.values,
        # Share of all non-null values, as value_counts(normalize=True) gives.
        '% Distribution': (top_counts / counts.sum()).values,
    })

    trace1 = go.Bar(x=value_dist_df[col], y=value_dist_df['Count'], name='Count',
                    marker=dict(color='rgb(34,163,192)'))
    trace2 = go.Scatter(x=value_dist_df[col], y=value_dist_df['% Distribution'],
                        name='% Distribution', yaxis='y2')

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(trace1)
    fig.add_trace(trace2, secondary_y=True)
    fig['layout'].update(height=600, width=800, title=f"{col} data distribution",
                         xaxis=dict(tickangle=-90))
    return fig


def _render_univariate_analysis(df_for_analysis, i):
    """Let the user pick columns and plot one chart per selected column."""
    cols_for_analysis = st.multiselect("Select Columns for Univariate Analysis",
                                       options=df_for_analysis.columns.values,
                                       key="univariate_columns_" + str(i))
    for col in cols_for_analysis:
        series = df_for_analysis[col]
        # Any numeric dtype with many distinct values gets a scatter plot;
        # everything else gets the value-distribution chart. The old check
        # only knew int64/float64 and skipped other dtypes silently.
        if pd.api.types.is_numeric_dtype(series) and series.nunique() > 10:
            fig = px.scatter(x=df_for_analysis.index, y=series,
                             labels=dict(x="Index", y=col))
        else:
            fig = _build_distribution_figure(series, col)
        st.plotly_chart(fig, use_container_width=True)


def create_component_for_analysis_for_single_df(selected_files, dfs, i):
    """Render the analysis UI for the ``i``-th selected file.

    Args:
        selected_files: Names of the files chosen by the user.
        dfs: Mapping of file name to its data frame.
        i: Index into ``selected_files``; also used to build widget keys.

    When "Analyse on Filtered Data" is ticked, the analyses run on the frame
    stored in ``st.session_state['filtered_data']`` for this file. If no
    filtered frame is stored yet, the unfiltered frame is used.
    """
    file_name = selected_files[i]
    st.subheader(file_name)
    df = dfs[file_name]

    filter_data = st.checkbox("Analyse on Filtered Data", key="filter_data_check" + str(i))
    if filter_data:
        set_filtered_data(df, selected_files, i)

    analysis_actions = st.multiselect("What analysis do you wish to do?",
                                      ['Summary of Data', 'Sample Data', 'Get Profile',
                                       'Univariate Analysis', 'Bivariate Analysis'],
                                      key='analysis_action_' + str(i))
    if not analysis_actions:
        return

    df_for_analysis = df
    if filter_data and 'filtered_data' in st.session_state:
        # The filter may not have produced a frame yet (no value entered).
        df_for_analysis = st.session_state['filtered_data'].get(file_name, df)

    for action in analysis_actions:
        if action == "Bivariate Analysis":
            add_chart_options_to_sidebar()
            # Honour the filter here as well (previously always used ``df``).
            create_for_bivariate_analysis(selected_files, df_for_analysis, i)
            continue

        # Every non-bivariate action resets the chart-type selection.
        clear_chart_type_session_var()

        if action == 'Sample Data':
            # sample(10) raises when the frame has fewer than 10 rows.
            sample_size = min(10, len(df_for_analysis))
            st.write(df_for_analysis.sample(sample_size) if sample_size else df_for_analysis)
        elif action == 'Get Profile':
            _render_profile_report(df_for_analysis, i)
        elif action == 'Summary of Data':
            st.write(df_for_analysis.describe())
        elif action == 'Univariate Analysis':
            _render_univariate_analysis(df_for_analysis, i)

def clear_chart_type_session_var():
    """Remove the ``'chart_type'`` entry from ``st.session_state`` if present."""
    if 'chart_type' in st.session_state:
        # The key must be the string literal; the bare name ``chart_type``
        # is undefined and raised NameError.
        del st.session_state['chart_type']
    
def add_chart_options_to_sidebar():
    """Show the graph-type radio in the sidebar and remember the choice.

    The radio is rendered only while ``'chart_type'`` is absent from
    ``st.session_state``. A choice other than ``'None'`` is stored under
    that key.
    """
    if 'chart_type' not in st.session_state:
        with st.sidebar:
            viz_type = st.radio("Graph Type", ('None', 'Cross Tab', 'Pivot Table', 'Box Plot'))
        if viz_type and viz_type != 'None':
            # Assignment, not comparison: ``==`` never stored the choice and
            # raised KeyError because the key does not exist yet.
            st.session_state['chart_type'] = viz_type
    
def create_for_bivariate_analysis(selected_files, df, i):
    """Render the bivariate-analysis controls and the selected chart.

    Args:
        selected_files: Unused here; kept for signature compatibility.
        df: Data frame to analyse.
        i: Index used to build unique widget keys.

    Only the ``'Cross Tab'`` chart type produces output. Nothing is drawn
    when no columns are chosen or no chart type is stored in
    ``st.session_state['chart_type']``.
    """
    target_column = st.selectbox("Select the target column ", df.columns.values,
                                 key="bivariate_target_column_" + str(i))
    bivariate_columns = st.multiselect("Select the columns to analyse ", df.columns.values,
                                       key="bivariate_analysis_columns_" + str(i))
    if not bivariate_columns:
        return

    # The chart type is only present once the user picked one in the
    # sidebar; an unguarded lookup raised KeyError before that.
    chart_type = st.session_state['chart_type'] if 'chart_type' in st.session_state else None

    if chart_type == 'Cross Tab':
        col_vals = [df[col] for col in bivariate_columns]
        if len(col_vals) > 3:
            st.warning("Too many columns to split on. Please consider reducing the no of columns")
        crosstab_df = pd.crosstab(df[target_column], col_vals, margins=True)
        st.write(crosstab_df.to_html(), unsafe_allow_html=True)
# 3 any other aggregation function can be used based on column type
        
    
    
def create_component_for_data_analysis():
    """Render the "Analyze Data" page.

    Lets the user pick any of the uploaded files and lays out one column per
    selected file, each filled by
    ``create_component_for_analysis_for_single_df``. Frames missing from
    ``st.session_state['data_frames']`` are read from their saved path with
    ``pd.read_csv`` and cached there.
    """
    if 'data_files' not in st.session_state:
        st.write("Upload a file to start analysis")
        return

    selected_files = st.multiselect("Select the File(S) to analyze",
                                    st.session_state['data_files'].keys())
    if not selected_files:
        return

    layout_columns = st.columns(len(selected_files))

    dfs = {}
    for name in selected_files:
        if name not in st.session_state['data_frames']:
            # Not cached yet: load it from the path recorded at upload time.
            saved_path = st.session_state['data_files'][name]
            st.session_state['data_frames'][name] = pd.read_csv(saved_path)
        dfs[name] = st.session_state['data_frames'][name]

    for i, layout_column in enumerate(layout_columns):
        with layout_column:
            create_component_for_analysis_for_single_df(selected_files, dfs, i)


def _add_feature_from_code(df, feature_name, code):
    """Run ``code``, then store ``add_feature(row)`` per row in ``df[feature_name]``.

    Returns ``True`` when the column was added. Otherwise an error is shown
    in the page and ``False`` is returned.
    """
    # SECURITY: this executes arbitrary Python typed into the page with the
    # privileges of the server process. Only expose this page to trusted users.
    # A copy of the module globals keeps names such as ``pd`` available to
    # the snippet. A bare ``exec(code)`` inside a function never makes
    # ``add_feature`` visible to the surrounding code (NameError).
    namespace = dict(globals())
    try:
        exec(code, namespace)
    except Exception as err:  # UI boundary: report errors in user code
        st.error(f"Could not run the feature code: {err}")
        return False

    add_feature = namespace.get("add_feature")
    if not callable(add_feature):
        st.error("The code must define a function named 'add_feature(row)'.")
        return False

    try:
        df[feature_name] = df.apply(add_feature, axis=1)
    except Exception as err:  # UI boundary: report errors in user code
        st.error(f"add_feature failed: {err}")
        return False
    return True


def main():
    """Build the page: the title, the sidebar menu and the selected view."""
    st.title("Model Results Analyzer")
    with st.sidebar:

        # NOTE(review): five menu entries but only four icons -- confirm the
        # intended icon for each entry.
        selected_menu = option_menu(None, ["Home", "Upload Data", "Add Features","Analyze Data","Iframe"],
                                    icons=['house', 'cloud-upload', "list-task", 'gear'],
                                    menu_icon="cast", default_index=0, orientation="vertical",
                                    styles={
                                        "container": {"padding": "0!important", "background-color": "#fafafa"},
                                        "icon": {"color": "orange", "font-size": "15px"},
                                        "nav-link": {"font-size": "15px", "text-align": "left", "margin": "0px",
                                                     "--hover-color": "#eee"},
                                        "nav-link-selected": {"background-color": "green"},
                                    })

    if selected_menu == "Home":
        st.markdown('**This is to analyse models performance.**')

    elif selected_menu == "Upload Data":

        create_upload_file_component()

        if 'data_files' in st.session_state:
            st.write(pd.DataFrame(
                data={"File Name": pd.DataFrame.from_dict(st.session_state['data_files'], orient='index').index}))

    elif selected_menu == "Analyze Data":
        create_component_for_data_analysis()

    elif selected_menu == "Add Features":
        if 'data_files' in st.session_state:
            selected_file = st.selectbox("Select the File(S) to analyze", st.session_state['data_files'].keys())

            if selected_file:
                df = st.session_state['data_frames'][selected_file]
                st.header("Enter the function definiton to create a new feature")
                feature_name = st.text_input("Enter the New Feature Name")
                st.warning("please retain the function signature as 'add_feature(row)'")

                default_code = "def add_feature(row):"
                content = st_ace(language="python", value=default_code)

                # Only act once the user has written something beyond the stub.
                if content and content != default_code:
                    if not feature_name:
                        st.info("Enter a name for the new feature to apply the function.")
                    elif _add_feature_from_code(df, feature_name, content):
                        st.session_state['data_frames'][selected_file] = df
                        st.write(df.columns.values)
    elif selected_menu == "Iframe":
        # st.components.v1.iframe("https://huggingface.co/spaces/Sasidhar/information-extraction-demo", width=None, height=None, scrolling=False)
        st.components.v1.iframe("https://docs.streamlit.io/en/latest", width=None, height=None, scrolling=False)

# Guard the entry point so importing this module does not render the UI.
if __name__ == "__main__":
    main()