eogreen committed on
Commit
9906f45
·
verified ·
1 Parent(s): 3399d6b

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +100 -0
  2. requirements.txt +2 -0
  3. train.py +67 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import uuid
4
+ import joblib
5
+ import json
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+
10
+ from huggingface_hub import CommitScheduler
11
+ from pathlib import Path
12
+
13
# Logging setup: each running instance of the app appends its prediction
# records to its own uniquely named JSON file inside the local "logs" folder.
log_folder = Path("logs/")
log_file = log_folder / f"data_{uuid.uuid4()}.json"

repo_id = "insurance-charge-predictor-logs"

# Commit scheduler bound to the log folder: it targets the "data" directory
# of the dataset repo named above (the `every` interval is in minutes per the
# huggingface_hub documentation).
scheduler = CommitScheduler(
    repo_id=repo_id,
    repo_type="dataset",
    folder_path=log_folder,
    path_in_repo="data",
    every=2,
)

# Restore the serialized model from disk
insurance_charge_predictor = joblib.load('model.joblib')
30
+
31
# Define the input features.
# Feature names used by the model:
#   numeric_features     = ['age', 'bmi', 'children']
#   categorical_features = ['sex', 'smoker', 'region']

# Numeric inputs (each component is created exactly once)
age_input = gr.Number(label="Age")
bmi_input = gr.Number(label="BMI")
children_input = gr.Number(label="Children")

# Categorical inputs and their allowed values:
#   sex:    ['female' 'male']
#   smoker: ['yes' 'no']
#   region: ['southwest' 'southeast' 'northwest' 'northeast']
sex_input = gr.Dropdown(['female', 'male'], label='Sex')
smoker_input = gr.Dropdown(['yes', 'no'], label='Smoker')
region_input = gr.Dropdown(
    ['southwest', 'southeast', 'northwest', 'northeast'],
    label='Region',
)

# Output component that displays the predicted charges
model_output = gr.Label(label="charges")
50
+
51
# Define the predict function, which takes the features, converts them to a
# DataFrame and makes a prediction using the saved model.
# The function runs when 'Submit' is clicked or when an API request is made.

def predict_term_deposit(age, bmi, children, sex, smoker, region):
    """Predict the insurance charge for one applicant and log the request.

    Args:
        age: Value of the "Age" input.
        bmi: Value of the "BMI" input.
        children: Value of the "Children" input.
        sex: Selected sex category ('female' or 'male').
        smoker: Selected smoker category ('yes' or 'no').
        region: Selected region category.

    Returns:
        The first (and only) element of the model's prediction for the
        single-row DataFrame built from the arguments.
    """
    # The DataFrame columns must carry the lowercase feature names
    # (age, bmi, children, sex, smoker, region): the model pipeline selects
    # its columns by these names, so capitalised keys would not be found.
    sample = {
        'age': age,
        'bmi': bmi,
        'children': children,
        'sex': sex,
        'smoker': smoker,
        'region': region,
    }

    data_point = pd.DataFrame([sample])

    prediction = insurance_charge_predictor.predict(data_point).tolist()

    # The log record keeps its original (capitalised) keys so that already
    # collected log files retain a single schema.
    record = {
        'Age': age,
        'BMI': bmi,
        'Children': children,
        'Sex': sex,
        'Smoker': smoker,
        'Region': region,
        'prediction': prediction[0],
    }

    # Write while holding the scheduler's lock so the append is not
    # interleaved with a scheduled commit of the log folder.
    with scheduler.lock:
        with log_file.open("a") as f:
            # One JSON object per line; without the newline consecutive
            # records would run together and the file could not be parsed.
            f.write(json.dumps(record))
            f.write("\n")

    return prediction[0]
83
+
84
# Input components, listed in the same order as the parameters of the
# prediction function they feed.
feature_inputs = [
    age_input,
    bmi_input,
    children_input,
    sex_input,
    smoker_input,
    region_input,
]

# Wrap the prediction function in a Gradio interface (web UI + API)
gr_interface = gr.Interface(
    fn=predict_term_deposit,
    inputs=feature_inputs,
    outputs=model_output,
    title="HealthyLife Insurance Charge Prediction",
    description="This API allows you to predict insurance charges based on the input features.",
    allow_flagging="auto",
    concurrency_limit=8,
)

# Enable request queuing, then start the app without a public share link
gr_interface.queue()
gr_interface.launch(share=False)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ scikit-learn==1.2.2
2
+ numpy==1.21.2
train.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from warnings import filterwarnings
3
+ filterwarnings('ignore')
4
+ import pandas as pd
5
+ import joblib
6
+ from sklearn.datasets import fetch_openml
7
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
8
+ from sklearn.compose import make_column_transformer
9
+ from sklearn.pipeline import make_pipeline
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.linear_model import LinearRegression
12
+ from sklearn.metrics import mean_squared_error, r2_score
13
+
14
# Load the raw dataset and discard its 'index' column
data_df = pd.read_csv('insurance.csv').drop(columns='index')

# Column roles used by the rest of the script
target = 'charges'
numeric_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

print("Creating data subsets...")

# Separate the predictors (X) from the target column (y)
y = data_df[target]
X = data_df.drop(columns=target)

print('Splitting data into train and test...')

# Hold out 20% of the rows (test_size=0.2) for evaluation; the fixed
# random_state keeps the split reproducible between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
36
+
37
print("Creating model pipeline...")

# Preprocessing: standard-scale the numeric columns and one-hot encode the
# categorical ones (handle_unknown='ignore' on the encoder).
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
)

model_linear_regression = LinearRegression(n_jobs=-1)

# Chain preprocessing and the regressor so that the serialized artifact
# accepts the raw feature columns directly.
model_pipeline = make_pipeline(
    preprocessor,
    model_linear_regression,
)

print("Estimating Model Pipeline...")
model_pipeline.fit(Xtrain, ytrain)

print('Model evaluation:')

# Predict on the held-out set once and reuse the result for both metrics
test_predictions = model_pipeline.predict(Xtest)

# print RMSE (squared=False makes mean_squared_error return the root)
print(f"RMSE: {mean_squared_error(ytest, test_predictions, squared=False)}")

# print R2 score
print(f"R2: {r2_score(ytest, test_predictions)}")

# Serialize the fitted pipeline to disk
print("Serializing Model...")
saved_model_path = "model.joblib"
joblib.dump(model_pipeline, saved_model_path)
print('done!')