Spaces:
Sleeping
Sleeping
weewoo2636
committed on
Commit
•
30452f4
1
Parent(s):
816ff37
Upload 6 files
Browse files- eda.py +182 -3
- eda_data.csv +0 -0
eda.py
CHANGED
@@ -7,6 +7,185 @@ import matplotlib.pyplot as plt
|
|
7 |
def app():
|
8 |
st.title('Exploratory Data Analysis')
|
9 |
|
10 |
-
df = pd.read_csv('
|
11 |
-
st.write('
|
12 |
-
st.write(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
def app():
    """Render the Exploratory Data Analysis page.

    Reads the EDA dataset, shows it as a preview table, then writes each
    analysis question followed by the chart that answers it.
    """
    st.title('Exploratory Data Analysis')

    # NOTE(review): this path is resolved against the process working
    # directory; confirm the app is always launched from the folder that
    # contains `deployment/`.
    df = pd.read_csv('deployment/eda_data.csv')
    st.write('Dataset Preview')
    st.write(df)

    # Each entry pairs a question with the function that draws its answer;
    # the sections are rendered top to bottom in this order.
    sections = (
        ('How is the percentage of default payment as education level increases?', vis_1),
        ('How is the contribution of each gender to default payment?', vis_2),
        ('Which one got more into default payment, customers with limit balance above or below average?', vis_3),
        ('How does the average of default payment changes as the total late payment rises?', vis_4),
        ('How is the contribution of each marital status to default payment?', vis_5),
    )
    for question, draw_chart in sections:
        st.write(question)
        draw_chart(df)
|
28 |
+
|
29 |
+
|
30 |
+
def vis_1(df):
    """Draw a bar chart of the default-payment percentage per education level.

    For every `education_level` value the percentage is
    sum(default_payment_next_month) / count(default_payment_next_month) * 100,
    rounded to two decimals. The chart is rendered with `st.pyplot` and the
    pyplot figure is cleared afterwards.

    Args:
        df: DataFrame with `education_level` and
            `default_payment_next_month` columns. It is not modified.
    """
    # Display names for the education codes (same pairing as the fixed
    # ticks 1-4 used previously); unknown codes fall back to their raw value.
    level_names = {
        1: 'graduate_school',
        2: 'university',
        3: 'highschool',
        4: 'others',
    }

    # One aggregation yields both numerator and denominator, so they can
    # never get out of step with each other.
    stats = df.groupby(by='education_level')['default_payment_next_month'].agg(['sum', 'count'])
    percentages = (stats['sum'] / stats['count'] * 100).round(2)
    levels = percentages.index.tolist()
    values = percentages.tolist()

    # set plot's title
    plt.title('Default Payment Percentage for each Education Level')

    # define plot
    ax = plt.gca()
    bars = ax.bar(levels, values)

    # percentages always live on a 0-100 scale
    ax.set_ylim([0, 100])

    # add axis label
    ax.set_xlabel('education level')
    ax.set_ylabel('default payment percentage')

    # put one tick under every bar that is actually present in the data
    ax.set_xticks(levels)
    ax.set_xticklabels([level_names.get(level, str(level)) for level in levels])

    # add bar label
    ax.bar_label(bars, labels=[f'{value}%' for value in values])

    # show plot, then reset the shared pyplot figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()
|
85 |
+
|
86 |
+
def vis_2(df):
    """Draw a pie chart of how default payments split between the sexes.

    Sums `default_payment_next_month` per `sex` value and renders the shares
    with `st.pyplot`; the pyplot figure is cleared afterwards.

    Args:
        df: DataFrame with `sex` and `default_payment_next_month` columns.
            It is not modified.
    """
    # Aggregating never mutates df, so no defensive copy is needed.
    totals = df.groupby(by='sex')['default_payment_next_month'].sum()

    # NOTE(review): these labels are positional against the sorted `sex`
    # values (smallest value -> 'male'); confirm against the dataset encoding.
    labels = ['male', 'female']
    if len(labels) != len(totals):
        # plt.pie raises when labels and wedges differ in number, so fall
        # back to the raw group keys instead of crashing the page.
        labels = [str(key) for key in totals.index]

    # set plot's title
    plt.title('Gender Contribution to Default Payment')

    # define plot
    plt.pie(totals.to_numpy(), labels=labels, autopct='%1.1f%%')

    # show plot, then reset the shared pyplot figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()
|
102 |
+
|
103 |
+
def vis_3(df):
    """Draw a horizontal bar chart of default payments by limit-balance group.

    Rows whose `limit_balance` is strictly below the column mean fall into
    'below_average'; all other rows fall into 'above_average'. The bars show
    the sum of `default_payment_next_month` per group. The chart is rendered
    with `st.pyplot` and the pyplot figure is cleared afterwards.

    Args:
        df: DataFrame with `limit_balance` and
            `default_payment_next_month` columns. It is not modified.
    """
    # get the average limit_balance
    avg_lim = df['limit_balance'].mean()

    # classify every row in one vectorized comparison; the result is used
    # only as a grouping key, so df itself never gains a column
    limit_group = (
        (df['limit_balance'] < avg_lim)
        .map({True: 'below_average', False: 'above_average'})
        .to_numpy()
    )

    # sum the default payments of each limit group
    totals = df['default_payment_next_month'].groupby(limit_group).sum()

    # set plot's title
    plt.title('Default Payment Amount Categorized by Limit Group')

    # define plot
    ax = plt.gca()
    bars = ax.barh(totals.index.tolist(), totals.to_numpy())

    # add axis label
    ax.set_xlabel('default payment amount')
    ax.set_ylabel('limit balance group')

    # keep the previous 0-550 window for small data, but widen it when a bar
    # (plus room for its label) would otherwise be cut off
    ax.set_xlim([0, max(550, totals.max() * 1.1)])

    # add label
    ax.bar_label(bars)

    # show plot, then reset the shared pyplot figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()
|
145 |
+
|
146 |
+
def vis_4(df):
    """Draw a line chart of the mean default payment per total late payment.

    The total is the row-wise sum of the `pay_1` ... `pay_6` columns; rows
    are grouped by that total and the mean of `default_payment_next_month`
    is plotted for each distinct total. The chart is rendered with
    `st.pyplot` and the pyplot figure is cleared afterwards.

    Args:
        df: DataFrame with `pay_1` ... `pay_6` and
            `default_payment_next_month` columns. It is not modified.

    Raises:
        KeyError: if any of the `pay_1` ... `pay_6` columns is missing.
    """
    # NOTE(review): assumes the late-payment columns are named pay_1..pay_6;
    # verify against the CSV header.
    late_columns = [f'pay_{i}' for i in range(1, 7)]

    # get total late payment; min_count=1 keeps a row as NaN only when all
    # six values are missing, matching a chained add with fill_value=0
    total_late_payment = df[late_columns].sum(axis=1, min_count=1)

    # group by total late payment and average the default payment flag
    avg_default = (
        df['default_payment_next_month']
        .groupby(total_late_payment.to_numpy())
        .mean()
    )

    # set plot's title
    plt.title('The Effect of Late Payment to Default Payment')

    # define plot
    plt.plot(avg_default.index, avg_default.to_numpy())

    # add axis label
    ax = plt.gca()
    ax.set_xlabel('total late payment (month)')
    ax.set_ylabel('average default payment')

    # show plot, then reset the shared pyplot figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()
|
175 |
+
|
176 |
+
def vis_5(df):
    """Draw a pie chart of how default payments split by marital status.

    Sums `default_payment_next_month` per `marital_status` value and renders
    the shares with `st.pyplot`; the pyplot figure is cleared afterwards.

    Args:
        df: DataFrame with `marital_status` and
            `default_payment_next_month` columns. It is not modified.
    """
    # Aggregating never mutates df, so no defensive copy is needed.
    totals = df.groupby(by='marital_status')['default_payment_next_month'].sum()

    # NOTE(review): these labels are positional against the sorted
    # `marital_status` values (smallest value -> 'others'); confirm against
    # the dataset encoding.
    labels = ['others', 'married', 'single', 'divorced']
    if len(labels) != len(totals):
        # plt.pie raises when labels and wedges differ in number, so fall
        # back to the raw group keys instead of crashing the page.
        labels = [str(key) for key in totals.index]

    # set plot's title
    plt.title('Contribution to Default Payment by Marital Status ')

    # define plot
    plt.pie(totals.to_numpy(), labels=labels, autopct='%1.1f%%')

    # show plot, then reset the shared pyplot figure for the next chart
    st.pyplot(plt.gcf())
    plt.clf()
|
eda_data.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|