|
from sklearn.ensemble import RandomForestClassifier |
|
from sklearn.pipeline import Pipeline |
|
from scripts.download_data import download_data |
|
from sklearn.metrics import f1_score |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.preprocessing import PowerTransformer |
|
from sklearn.preprocessing import OneHotEncoder |
|
from sklearn.pipeline import Pipeline |
|
from sklearn.compose import ColumnTransformer |
|
from sklearn.preprocessing import QuantileTransformer |
|
import pandas as pd |
|
|
|
def calculate_metric(model):
    """Return the F1 score of ``model`` on the test split.

    The second element returned by ``download_data()`` is used as the test
    set; its ``'cardio'`` column is the target and every other column is
    passed to ``model.predict``.

    Args:
        model: Any object exposing ``predict(X)``.

    Returns:
        The value of ``f1_score`` computed with ``pos_label='positive'``.
    """
    _train_unused, test_frame = download_data()

    features = test_frame.drop(columns=['cardio'])
    target = test_frame['cardio']

    predictions = model.predict(features)

    # NOTE(review): pos_label='positive' presumes the 'cardio' labels are the
    # strings 'positive'/... — confirm against what download_data() returns.
    return f1_score(target, predictions, pos_label='positive')
|
|
|
|
|
def model_training():
    """Fit the preprocessing + random-forest pipeline and score it.

    The first element returned by ``download_data()`` is used as the
    training set, with ``'cardio'`` as the target column. Numeric columns go
    through quantile -> standard-scale -> power transforms; categorical
    columns are one-hot encoded. Columns not listed in either group are not
    passed to the ``ColumnTransformer``.

    Returns:
        A ``(fitted_pipeline, f1)`` tuple, where ``f1`` is the result of
        ``calculate_metric(fitted_pipeline)``.
    """
    train_frame, _test_unused = download_data()
    target = train_frame['cardio']
    features = train_frame.drop(columns=['cardio'])

    numeric_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
    categorical_features = [
        'gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active',
    ]

    # Numeric branch: three transforms chained in this exact order.
    numeric_steps = [
        ('qt', QuantileTransformer(output_distribution="normal")),
        ('scaler', StandardScaler()),
        ('power', PowerTransformer()),
    ]
    numeric_branch = Pipeline(numeric_steps)

    # Categorical branch: categories unseen during fit are ignored at
    # transform time instead of raising.
    categorical_branch = Pipeline(
        [('encoder', OneHotEncoder(handle_unknown='ignore'))]
    )

    column_preprocessing = ColumnTransformer(
        transformers=[
            ('num_p', numeric_branch, numeric_features),
            ('cat_p', categorical_branch, categorical_features),
        ]
    )

    # oob_score=True is requested, but the resulting score is not read
    # anywhere in this function.
    forest = RandomForestClassifier(
        n_estimators=200,
        criterion="gini",
        min_samples_split=15,
        max_depth=15,
        oob_score=True,
    )

    full_pipeline = Pipeline([
        ('preprocessors', column_preprocessing),
        ('model', forest),
    ])
    full_pipeline.fit(features, target)

    score = calculate_metric(full_pipeline)
    return full_pipeline, score
|
|
|
|