import pandas as pd |
import numpy as np |
from sklearn import metrics |
from sklearn.model_selection import train_test_split |
from sklearn import preprocessing |
from sklearn.pipeline import Pipeline |
from sklearn.impute import SimpleImputer |
from sklearn.pipeline import Pipeline |
from sklearn import compose |
from zindi_challenge.scripts.config import * |
from sklearn import linear_model |
from sklearn import svm |
from sklearn import ensemble |
def data_gathering(list_of_csv_filepaths=None):
    """Load every CSV source and stack the results into one DataFrame.

    Args:
        list_of_csv_filepaths: Optional iterable of CSV sources, each being
            anything ``pandas.read_csv`` accepts (path or file-like object).
            When omitted, no source is read, which matches the previously
            hard-coded empty list.

    Returns:
        The row-wise concatenation (``axis=0``) of the loaded frames; each
        frame keeps its own index labels. An empty DataFrame is returned when
        there is nothing to load.
    """
    if list_of_csv_filepaths is None:
        list_of_csv_filepaths = []

    list_of_dataframes = [pd.read_csv(fp) for fp in list_of_csv_filepaths]

    if not list_of_dataframes:
        # pd.concat raises ValueError on an empty sequence, so short-circuit
        # instead of letting every caller crash when no file is configured.
        return pd.DataFrame()

    return pd.concat(list_of_dataframes, axis=0)
def remove_rows_with_Nan(data):
    """Return only the rows of ``data`` that contain no missing value.

    A row is kept when every one of its cells is non-missing; the input
    object itself is not modified.
    """
    # "no cell is missing" is the same test as "every cell is present".
    complete_rows = data.notna().all(axis=1)
    return data[complete_rows]
def processing(dataset: "pd.DataFrame | None" = None):
    """Impute, scale and one-hot encode a dataset and return it as a DataFrame.

    Args:
        dataset: Raw data to transform. When omitted it is loaded with
            ``data_gathering()`` at call time. The previous default invoked
            ``data_gathering()`` once, while the ``def`` statement itself was
            being executed, so any loading failure broke the import.

    Returns:
        A DataFrame with the transformed features, indexed like ``dataset``.
        Columns carry the transformer's output feature names when those are
        available, positional integer labels otherwise.
    """
    if dataset is None:
        dataset = data_gathering()

    numerical_cols = dataset.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = dataset.select_dtypes(exclude=np.number).columns.tolist()

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        # NOTE(review): AttributeDeleter is not defined in the visible part of
        # this file; presumably it is provided by the config star-import --
        # confirm before relying on this step.
        ('drop_attributes', AttributeDeleter()),
        ('std_scaler', preprocessing.StandardScaler()),
    ])

    full_pipeline = compose.ColumnTransformer(
        [
            # Use the column lists computed above (the former
            # *_attributes names were never defined).
            ('num', num_pipeline, numerical_cols),
            ('cat', preprocessing.OneHotEncoder(), categorical_cols),
        ],
        # Always produce a dense array so it can be wrapped in a DataFrame.
        sparse_threshold=0,
    )
    transformed = full_pipeline.fit_transform(dataset)

    try:
        feature_names = full_pipeline.get_feature_names_out()
    except (AttributeError, ValueError):
        # Raised when a step cannot report its output names (custom
        # transformer, older scikit-learn); fall back to positional labels.
        feature_names = None

    df_dataset_processed = pd.DataFrame(
        transformed, index=dataset.index, columns=feature_names
    )

    # TODO: the keep-list was an empty placeholder applied to ``None`` (which
    # always raised TypeError). Until a real selection is defined, every
    # transformed column is kept.
    columns_to_keep = df_dataset_processed.columns.tolist()
    return df_dataset_processed[columns_to_keep]
def modelling(y_col):
    """Fit a logistic regression on the processed dataset and return it.

    The data is obtained by calling ``processing()`` with its defaults;
    ``y_col`` names the target column, and every other column is used as a
    feature.
    """
    processed = processing()

    # Features first, target second: same evaluation order as before.
    features = processed.drop(columns=y_col)
    target = processed[y_col]

    classifier = linear_model.LogisticRegression()
    # fit() hands back the estimator itself, so this is the fitted model.
    return classifier.fit(features, target)