|
import pandas as pd |
|
import numpy as np |
|
from sklearn import metrics |
|
from sklearn.model_selection import train_test_split |
|
from sklearn import preprocessing |
|
from sklearn.pipeline import Pipeline |
|
from sklearn.impute import SimpleImputer |
|
|
|
from sklearn.pipeline import Pipeline |
|
from sklearn import compose |
|
|
|
from zindi_challenge.scripts.config import * |
|
|
|
from sklearn import linear_model |
|
from sklearn import svm |
|
from sklearn import ensemble |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def data_gathering():
    """Load all raw CSV files and aggregate them into a single DataFrame.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of every loaded CSV; an empty DataFrame
        when no file paths are configured.
    """
    list_of_dataframes = []
    # TODO(review): populate with the challenge's raw CSV paths
    # (presumably from zindi_challenge.scripts.config — confirm).
    list_of_csv_filepaths = []

    for fp in list_of_csv_filepaths:
        df = pd.read_csv(fp)
        list_of_dataframes.append(df)

    # pd.concat raises "ValueError: No objects to concatenate" on an empty
    # sequence; return an empty frame instead so callers degrade gracefully.
    if not list_of_dataframes:
        return pd.DataFrame()

    global_df = pd.concat(list_of_dataframes, axis=0)

    return global_df
|
|
|
|
|
|
|
|
|
def remove_rows_with_Nan(data):
    """Drop every row of *data* that contains at least one missing value.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; it is not modified in place.

    Returns
    -------
    pd.DataFrame
        A copy of *data* restricted to fully-populated rows (original
        index labels are preserved).
    """
    # Equivalent to the manual mask data[~data.isna().any(axis=1)],
    # but uses the idiomatic pandas API.
    return data.dropna(axis=0, how="any")
|
|
|
|
|
|
|
def processing(dataset: pd.DataFrame = None):
    """Transform *dataset* into a numeric, model-ready DataFrame.

    Numeric columns are median-imputed and standardized; non-numeric
    columns are one-hot encoded.

    Parameters
    ----------
    dataset : pd.DataFrame, optional
        Raw input data. Defaults to the output of ``data_gathering()``.

    Returns
    -------
    pd.DataFrame
        The transformed feature matrix, indexed like *dataset*.
    """
    # None sentinel: the previous default ``dataset = data_gathering()``
    # ran data_gathering() once at import time and froze that result for
    # every subsequent call.
    if dataset is None:
        dataset = data_gathering()

    numerical_cols = dataset.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = dataset.select_dtypes(exclude=np.number).columns.tolist()

    # Numeric branch: fill missing values with the column median, then
    # scale to zero mean / unit variance. (The undefined AttributeDeleter
    # step was removed — it raised NameError.)
    num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                             ('std_scaler', preprocessing.StandardScaler()),
                             ])
    full_pipeline = compose.ColumnTransformer([('num', num_pipeline, numerical_cols),
                                               ('cat', preprocessing.OneHotEncoder(), categorical_cols),
                                               ])

    transformed = full_pipeline.fit_transform(dataset)
    # OneHotEncoder-heavy inputs can yield a scipy sparse matrix; densify
    # so the result always fits in a plain DataFrame.
    if hasattr(transformed, "toarray"):
        transformed = transformed.toarray()

    df_dataset_processed = pd.DataFrame(transformed,
                                        columns=full_pipeline.get_feature_names_out(),
                                        index=dataset.index)
    return df_dataset_processed
|
|
|
|
|
|
|
|
|
|
|
|
|
def modelling(y_col):
    """Fit a logistic-regression classifier on the processed dataset.

    Parameters
    ----------
    y_col : str
        Name of the target column within the processed dataset.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The fitted model.
    """
    processed = processing()

    # Split the target column away from the feature matrix.
    features = processed.drop(columns=y_col)
    target = processed[y_col]

    classifier = linear_model.LogisticRegression()
    classifier.fit(features, target)
    return classifier