# mlops/scripts/model_training.py
# Emil25's picture
# Upload 4 files
# 61f924c verified
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from scripts.download_data import download_data
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import QuantileTransformer
import pandas as pd
def calculate_metric(model):
    """Return the F1 score of *model* on the test split.

    The test split is the second item returned by ``download_data()``. Its
    ``'cardio'`` column is used as the target and every other column is
    passed to ``model.predict`` as features.

    Args:
        model: Object exposing a ``predict`` method (here, the fitted
            pipeline built by ``model_training``).

    Returns:
        The value of ``f1_score`` computed with ``pos_label='positive'``.
    """
    # NOTE(review): the data is fetched again here rather than reused from
    # training; whether both calls yield the same split depends on
    # download_data() -- confirm.
    _unused_train, holdout = download_data()
    features = holdout.drop(columns=['cardio'])
    target = holdout['cardio']

    predictions = model.predict(features)

    # NOTE(review): pos_label='positive' presumes 'cardio' holds the string
    # label 'positive' -- verify against download_data().
    return f1_score(target, predictions, pos_label='positive')
def model_training():
    """Fit the preprocessing + random-forest pipeline on the training split.

    The training split is the first item returned by ``download_data()``;
    ``'cardio'`` is the target column. Numeric columns pass through a
    quantile transform, standard scaling and a power transform; categorical
    columns are one-hot encoded. The combined features feed a
    ``RandomForestClassifier``.

    Returns:
        A ``(pipeline, f1)`` tuple: the fitted ``Pipeline`` and the score
        that ``calculate_metric`` reports for it.
    """
    training_frame, _unused_test = download_data()
    features = training_frame.drop(columns=['cardio'])
    target = training_frame['cardio']

    numeric_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
    categorical_features = [
        'gender',
        'cholesterol',
        'gluc',
        'smoke',
        'alco',
        'active',
    ]

    # Numeric branch: the three transformers run in the listed order.
    numeric_branch = Pipeline(steps=[
        ('qt', QuantileTransformer(output_distribution="normal")),
        ('scaler', StandardScaler()),
        ('power', PowerTransformer()),
    ])

    # Categorical branch: categories unseen during fit are ignored at
    # transform time instead of raising.
    categorical_branch = Pipeline(steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

    column_preprocessing = ColumnTransformer(transformers=[
        ('num_p', numeric_branch, numeric_features),
        ('cat_p', categorical_branch, categorical_features),
    ])

    # No random_state is set, so repeated runs may produce different forests.
    forest = RandomForestClassifier(
        n_estimators=200,
        criterion="gini",
        min_samples_split=15,
        max_depth=15,
        oob_score=True,
    )

    full_pipeline = Pipeline(steps=[
        ('preprocessors', column_preprocessing),
        ('model', forest),
    ])
    full_pipeline.fit(features, target)

    score = calculate_metric(full_pipeline)
    return full_pipeline, score