Spaces:
Runtime error
Runtime error
Andika Atmanegara Putra
committed on
Commit
•
6bbca31
1
Parent(s):
4dda477
add all files
Browse files- New Text Document.txt +0 -0
- ab_model.pkl +3 -0
- app.py +10 -0
- diabetes.png +0 -0
- diabetes2.png +0 -0
- diabetes_prediction_dataset.csv +0 -0
- eda.py +125 -0
- num_cols_nsc.txt +1 -0
- num_cols_sc.txt +1 -0
- prediction.py +101 -0
- requirements.txt +9 -0
- scale_feat.pkl +3 -0
- winsoriser.pkl +3 -0
New Text Document.txt
ADDED
File without changes
|
ab_model.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:454753be2ee87a8b266b8a06de5f11e9b478ee727b22fb6803d3adc0ee988eb0
|
3 |
+
size 27691
|
app.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import eda
|
3 |
+
import prediction
|
4 |
+
|
5 |
+
# Map each sidebar entry to the function that renders that page.
_page_renderers = {
    'Explore': eda.run,
    'Prediction': prediction.run,
}

navigation = st.sidebar.selectbox('pilih halaman: ', ('Explore', 'Prediction'))

# Any selection other than 'Explore' falls through to the prediction page.
_page_renderers.get(navigation, prediction.run)()
|
diabetes.png
ADDED
diabetes2.png
ADDED
diabetes_prediction_dataset.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eda.py
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import seaborn as sns
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import plotly.express as px
|
6 |
+
from PIL import Image
|
7 |
+
|
8 |
+
# Page-level Streamlit settings; this call runs when the module is imported.
_PAGE_OPTIONS = {
    'page_title': 'Diabetes Prediction',
    'layout': 'wide',
    'initial_sidebar_state': 'expanded',
}
st.set_page_config(**_PAGE_OPTIONS)
|
13 |
+
|
14 |
+
# Introductory markdown shown at the top of the exploration page.
_BACKGROUND_MD = '''
## Background
Firstly, diabetes is a prevalent and chronic health condition that affects a significant portion of the population worldwide.
By providing a prediction model for diabetes, it can contribute to early detection and intervention, which is crucial in
managing the disease and preventing complications. Secondly, the integration of a diabetes prediction model in the web project
aims to enhance user experience and provide personalized health insights. Users can input their relevant health data, such as BMI,
blood glucose levels, and other factors, to obtain a prediction of their likelihood of having diabetes.

This information can empower individuals to make informed decisions about their health, seek appropriate medical attention
if necessary, and adopt preventive measures to reduce the risk of diabetes. Overall, the inclusion of a diabetes prediction
feature aligns with the objective of promoting health awareness and enabling users to take proactive steps towards their
well-being.

## Problem Statement
Using a dataset obtained from Kaggle, the goal is to build a predictive model that determines whether
individuals with specific characteristics are likely to have diabetes or not.

## Objective
The objectives of this project are to preprocess the dataset, explore its features, analyze the data,
implement four different algorithms for predicting the target variable, and perform Hyperparameter Tuning
to optimize the models' performance.

## About Dataset
| Variable | Description |
|-------------------------|-----------------------------------------------------------------------------------------------|
| Gender | Gender refers to the biological sex of the individual |
| Age | Age is an important factor as diabetes is more commonly diagnosed in older adults |
| hypertension | Hypertension is a medical condition in which the blood pressure in the arteries is |
| | persistently elevated (1 = True, 0 = False) |
| heart_disease | Heart disease is another medical condition that is associated with an increased risk of |
| | developing diabetes |
| smoking_history | Smoking history is also considered a risk factor for diabetes. |
| bmi | BMI (Body Mass Index) is a measure of body fat based on weight and height |
| HbA1c_level | HbA1c (Hemoglobin A1c) level is a measure of a person's average blood sugar level over the |
| | past 2-3 months |
| blood_glucose_level | Blood glucose level refers to the amount of glucose in the bloodstream at a given time |
| diabetes | Diabetes is the target variable being predicted (1 = True, 0 = False) |

'''

# Commentary displayed directly under the class-distribution pie chart.
_PIE_SUMMARY_MD = '''
Based on the chart above, around 91.5% of the total 100,000 patients do
not suffer from diabetes and only **8.5%** of patients **do have diabetes**.
91.5% of total non-diabetic patients will be analyzed with health factors
to predict whether the patient or others can get diabetes or not
'''

# Label and colour used for each value of the ``diabetes`` target column.
_DIABETES_SLICES = {
    0: ('non-diabetic', 'Grey'),
    1: ('diabetic', 'red'),
}


def _pie_labels_and_colors(classes):
    """Return ``(labels, colors)`` lists aligned with the order of *classes*.

    Each class value is looked up individually, so a slice keeps its own label
    and colour regardless of the order in which the counts arrive. Values that
    are not in ``_DIABETES_SLICES`` fall back to ``str(value)`` and a neutral
    colour instead of raising.
    """
    labels = []
    colors = []
    for value in classes:
        label, color = _DIABETES_SLICES.get(value, (str(value), 'lightgrey'))
        labels.append(label)
        colors.append(color)
    return labels, colors


def run():
    """Render the exploration page: intro text, raw data table and three charts.

    Reads ``diabetes.png`` and ``diabetes_prediction_dataset.csv`` via relative
    paths and draws everything through Streamlit. Returns nothing.
    """
    # title
    st.title('Diabetes Exploration')
    st.subheader('Explore The Diabetes Metrics & Dataset')
    # add pic
    image = Image.open('diabetes.png')
    st.image(image)
    st.markdown('---')

    st.markdown(_BACKGROUND_MD)
    st.markdown('---')

    st.subheader('Data Exploratory')
    st.markdown('---')

    st.write('### Patient Information')

    # show dataframe
    data = pd.read_csv('diabetes_prediction_dataset.csv')
    st.dataframe(data)
    st.markdown('---')

    # Distribution of diabetic vs. non-diabetic patients
    class_counts = data['diabetes'].value_counts()
    labels, colors = _pie_labels_and_colors(class_counts.index)
    fig, ax = plt.subplots()
    ax.pie(class_counts,
           labels=labels,
           autopct='%1.1f%%',
           colors=colors,
           startangle=25,
           explode=[0.05] * len(class_counts))
    ax.set_title('Diabetes Distribution')
    ax.axis('equal')
    st.pyplot(fig)
    # Close the figure once rendered so figures do not pile up across reruns.
    plt.close(fig)
    # Rendered explicitly: a bare string literal is only displayed by
    # Streamlit "magic" in the main script, not in an imported module.
    st.markdown(_PIE_SUMMARY_MD)
    st.markdown('---')

    # Density plot of a user-selected numeric column
    st.subheader('Chart Based on User Input ')
    st.markdown('---')

    choice = st.selectbox('Pick Numeric Columns: ', ('age',
                                                     'heart_disease',
                                                     'bmi',
                                                     'HbA1c_level', 'blood_glucose_level'))

    fig, ax = plt.subplots(figsize=(15, 10))
    sns.kdeplot(data[choice], fill=True, ax=ax)
    ax.set_title(choice.capitalize()+' Ratio')
    st.pyplot(fig)
    plt.close(fig)
    st.markdown('---')

    # Count plot of a user-selected categorical column, split by diabetes status
    pilihan_kategori = st.selectbox('Pick Category Column : ', ('gender','hypertension','smoking_history','diabetes'))
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=data, x=pilihan_kategori, hue='diabetes', palette='Set2', ax=ax)

    ax.set_xlabel(pilihan_kategori.capitalize())
    ax.set_ylabel('Count')
    ax.set_title(pilihan_kategori.capitalize()+' Ratio')
    ax.legend(title='Diabetes')

    st.pyplot(fig)
    plt.close(fig)


if __name__ == '__main__':
    run()
|
num_cols_nsc.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
["gender", "hypertension", "heart_disease"]
|
num_cols_sc.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
["age", "bmi", "hemoglobin_level", "blood_glucose_level"]
|
prediction.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
import pickle
|
6 |
+
import json
|
7 |
+
from PIL import Image
|
8 |
+
|
9 |
+
# Load every serialized artefact this page needs, once, at import time.


def _unpickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _read_json(path):
    """Return the parsed JSON content of the text file at *path*."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Model
ab_model = _unpickle('ab_model.pkl')

# Pre-processing objects
scale_feat = _unpickle('scale_feat.pkl')
winsoriser = _unpickle('winsoriser.pkl')

# Column-name lists: columns passed through the scaler, then columns used as-is
num_cols_sc = _read_json('num_cols_sc.txt')
num_cols_nsc = _read_json('num_cols_nsc.txt')
|
27 |
+
|
28 |
+
|
29 |
+
def _build_model_input(data_inf, scaler, scaled_cols, passthrough_cols):
    """Return the feature frame for the model: scaled columns, then raw ones.

    Args:
        data_inf: DataFrame holding the user-supplied values.
        scaler: object whose ``transform`` is applied to ``data_inf[scaled_cols]``.
        scaled_cols: names of the columns to pass through ``scaler``.
        passthrough_cols: names of the columns appended unchanged.

    Returns:
        A new DataFrame with ``scaled_cols`` followed by ``passthrough_cols``
        and a fresh 0-based index. ``data_inf`` is not modified.
    """
    scaled = pd.DataFrame(scaler.transform(data_inf[scaled_cols]), columns=scaled_cols)
    # Non-inplace reset: avoids mutating a slice of ``data_inf`` and aligns the
    # row index with ``scaled`` so the concat cannot introduce NaN rows.
    passthrough = data_inf[passthrough_cols].reset_index(drop=True)
    return pd.concat([scaled, passthrough], axis=1)


def run():
    """Render the prediction page: input form, echo of the inputs, and result.

    The widgets live inside a Streamlit form; the entered values are shown as
    a one-row table, and once the form is submitted they are scaled, combined
    and passed to ``ab_model.predict``. Returns nothing.
    """
    with st.form(key='from_diabetes'):

        st.title('Prediction Page')

        # sub header
        st.subheader('We calculate your metrics to calculate diabetes')

        # add pic
        image = Image.open('diabetes2.png')
        st.image(image)
        st.write('Columns below are parameter we would like to use to predict if a patient have a diabetes or not.')
        st.write('*`Please fill columns below to predict`*')

        gender = st.selectbox('Gender', [0,1], help='0 = Female, 1 = Male')

        age = st.number_input('Age', min_value=25, max_value=80,
                              value=45, step=1, help='Usia Pasien')

        hypertension = st.number_input('Hypertension', min_value=0, max_value=1 , value=0,
                                       step=1, help='have hypertension?')

        heart_disease = st.number_input('Heart Disease', min_value=0, max_value=1 , value=0,
                                        step=1, help='have heart disease?')

        # BMI and HbA1c are fractional measurements, so these are float inputs
        # (st.number_input requires min/max/value/step to share one type).
        bmi = st.number_input('Body Mass Index', min_value=5.0, max_value=80.0,
                              value=30.0, step=0.1, help='Amount of BMI')

        HbA1c_level = st.number_input('Hemoglobin Level', min_value=3.0, max_value=10.0,
                                      value=6.0, step=0.1, help='Level of Hemoglobin 3-10')

        blood_glucose_level = st.slider('Glucose Level', 0, 400, 150, step=10,
                                        help='Glucose amount in blood stream')

        st.markdown('---')
        submitted = st.form_submit_button('Predict')

    # Keys must match the column names listed in num_cols_sc / num_cols_nsc.
    data_inf = {
        'age': age,
        'bmi': bmi,
        'hemoglobin_level': HbA1c_level,
        'blood_glucose_level': blood_glucose_level,
        'gender': gender,
        'hypertension': hypertension,
        'heart_disease': heart_disease,
    }

    data_inf = pd.DataFrame([data_inf])
    st.dataframe(data_inf)

    if submitted:
        # scaling + assembling the final feature frame
        data_final = _build_model_input(data_inf, scale_feat, num_cols_sc, num_cols_nsc)

        # modeling
        y_pred_inf = ab_model.predict(data_final)

        if y_pred_inf[0] == 1:
            st.write('**`Prediction: You Have Diabetes`**')
        else:
            st.write('# **`Prediction: You do not Have Diabetes`**')


if __name__ == '__main__':
    run()
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
pandas
|
3 |
+
seaborn
|
4 |
+
matplotlib
|
5 |
+
plotly
|
6 |
+
Pillow
|
7 |
+
catboost
|
8 |
+
feature-engine
|
9 |
+
scikit-learn==1.2.2
|
scale_feat.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:24ecface8c267c8b39e51c68eee345f48bb24f6a2febfc4cafc5d8b2824fc2fa
|
3 |
+
size 783
|
winsoriser.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c663972e73c566cd5440a11e16d3c077ce5a0bea17e80054f90dc9e9fa71891e
|
3 |
+
size 452
|