Spaces:

eogreen
/

eric_green_insurance_charge_prediction

Sleeping

App Files Files Community

eogreen committed on Sep 14, 2024

Commit

9906f45

verified ·

1 Parent(s): 3399d6b

Upload 3 files

Browse files

Files changed (3) hide show

app.py +100 -0
requirements.txt +2 -0
train.py +67 -0

app.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import os
+import uuid
+import joblib
+import json
+import gradio as gr
+import pandas as pd
+from huggingface_hub import CommitScheduler
+from pathlib import Path
+# Configure the logging functionality
+log_file = Path("logs/") / f"data_{uuid.uuid4()}.json"
+log_folder = log_file.parent
+repo_id = "insurance-charge-predictor-logs"
+# Create a commit scheduler
+scheduler = CommitScheduler(
+    repo_id=repo_id,
+    repo_type="dataset",
+    folder_path=log_folder,
+    path_in_repo="data",
+    every=2
+)
+# Load the saved model
+insurance_charge_predictor = joblib.load('model.joblib')
+# Define the input features
+#numeric_features = ['age', 'bmi', 'children']
+#categorical_features = ['sex', 'smoker', 'region']
+age_input = gr.Number(label="Age")
+age_input = gr.Number(label="Age")
+bmi_input = gr.Number(label="BMI")
+children_input = gr.Number(label="Children")
+# sex: ['female' 'male']
+# smoker: ['yes' 'no']
+# region: ['southwest' 'southeast' 'northwest' 'northeast']
+sex_input = gr.Dropdown(['female','male'],label='Defaulter')
+smoker_input = gr.Dropdown(['yes','no'],label='Smoker')
+region_input = gr.Dropdown(['southwest', 'southeast', 'northwest', 'northeast'],label='Region')
+model_output = gr.Label(label="charges")
+# Define the predict function which will take features, convert to dataframe and make predictions using the saved model
+# the functions runs when 'Submit' is clicked or when a API request is made
+def predict_term_deposit(age, bmi, children, sex, smoker, region):
+    sample = {
+        'Age': age,
+        'BMI': bmi,
+        'Children': children,
+        'Sex': sex,
+        'Smoker': smoker,
+        'Region': region
+    }
+    data_point = pd.DataFrame([sample])
+    prediction = insurance_charge_predictor.predict(data_point).tolist()
+    with scheduler.lock:
+        with log_file.open("a") as f:
+            f.write(json.dumps(
+                {
+                    'Age': age,
+                    'BMI': bmi,
+                    'Children': children,
+                    'Sex': sex,
+                    'Smoker': smoker,
+                    'Region': region,
+                    'prediction': prediction[0]
+                }
+            ))
+    return prediction[0]
+gr_interface = gr.Interface(
+    fn=predict_term_deposit,
+    inputs=[age_input,
+            bmi_input,
+            children_input,
+            sex_input,
+            smoker_input,
+            region_input],
+    outputs=model_output,
+    title="HealthyLife Insurance Charge Prediction",
+    description="This API allows you to predict insurance charges based on the input features.",
+    allow_flagging="auto",
+    concurrency_limit=8
+)
+gr_interface.queue()
+gr_interface.launch(share=False)

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ scikit-learn==1.2.2
2	+ numpy==1.21.2

train.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from warnings import filterwarnings
+filterwarnings('ignore')
+import pandas as pd
+import joblib
+from sklearn.datasets import fetch_openml
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import make_column_transformer
+from sklearn.pipeline import make_pipeline
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error, r2_score
+# Read data
+data_df = pd.read_csv('insurance.csv')
+data_df = data_df.drop(columns='index')
+target = 'charges'
+numeric_features = ['age', 'bmi', 'children']
+categorical_features = ['sex', 'smoker', 'region']
+print("Creating data subsets...")
+# Split the data into features and target
+X = data_df.drop(target, axis=1)
+y = data_df[target]
+print('Splitting data into train and test...')
+# Split the independent and dependent features into x and y variables with a test size 0.2% and random at 42
+Xtrain, Xtest, ytrain, ytest = train_test_split(
+    X, y,
+    test_size=0.2,
+    random_state=42
+)
+print("Creating model pipeline...")
+# Features to scale and encode
+preprocessor = make_column_transformer(
+    (StandardScaler(), numeric_features),
+    (OneHotEncoder(handle_unknown='ignore'), categorical_features)
+)
+model_linear_regression = LinearRegression(n_jobs=-1)
+model_pipeline = make_pipeline(
+    preprocessor,
+    model_linear_regression
+)
+print("Estimating Model Pipeline...")
+model_pipeline.fit(Xtrain, ytrain)
+print('Model evaluation:')
+# print RMSE
+print(f"  RMSE: {mean_squared_error(ytest, model_pipeline.predict(Xtest), squared=False)}")
+# print R2 score
+print(f"  R2: {r2_score(ytest, model_pipeline.predict(Xtest))}")
+# Serialize the model
+print("Serializing Model...")
+saved_model_path = "model.joblib"
+joblib.dump(model_pipeline, saved_model_path)
+print('done!')