# Lim0011's picture
# Upload 251 files
# 85e3d20 verified
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from public_timeseries_testing_util import MockApi
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, GroupKFold, cross_val_score
from sklearn.utils import check_consistent_length
# Define the metric
def smapep1(y_true, y_pred):
    """Return the SMAPE of y+1 as a percentage (smaller is better).

    Both inputs are flattened, so multi-output targets are scored as one
    pooled set of values. Entries where ``y_true`` is not finite (NaN/inf)
    are dropped from both arrays before scoring.

    Parameters:
        y_true: array-like of observed values; may contain missing values.
        y_pred: array-like of predicted values, same number of samples.

    Returns:
        A nonnegative float; 100 corresponds to 100 % error. If every
        ``y_true`` entry is missing the average of an empty array (NaN)
        is returned.

    Raises:
        ValueError: if the inputs have inconsistent lengths, or if any
            remaining ``y_true`` or ``y_pred`` value is negative.
    """
    check_consistent_length(y_true, y_pred)
    # np.asarray copies only when required. np.array(..., copy=False) raises
    # on NumPy >= 2.0 whenever a copy cannot be avoided (e.g. list input).
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # Build the mask once, from the unfiltered y_true, and apply it to both.
    observed = np.isfinite(y_true)
    y_true, y_pred = y_true[observed], y_pred[observed]

    if (y_true < 0).any():
        raise ValueError('y_true < 0')
    if (y_pred < 0).any():
        raise ValueError('y_pred < 0')

    # The "+ 1" keeps the denominator positive when both values are zero.
    denominator = (y_true + y_pred) / 2 + 1
    ape = np.abs(y_pred - y_true) / denominator
    return np.average(ape) * 100
# Scorer wrapper around smapep1. Because smapep1 is an error (smaller is
# better), greater_is_better=False is passed so that the scorer returns
# nonpositive values and "greater is better" holds for model selection.
# It is used as the `scoring` argument to cross_val_score in the main script,
# where the sign is flipped back before printing.
smapep1_scorer = make_scorer(smapep1, greater_is_better=False)
def get_predictions(my_train, model):
# Forecast
my_train = my_train.fillna(0)
result = pd.DataFrame(columns = ['prediction_id', 'rating'])
final = []
target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
for u in target:
# Predict
X = my_train["visit_month"]
predict = model[u].predict(X.values.reshape(-1, 1)).tolist()
complete_result = my_train[["visit_id",'visit_month']].values.tolist()
for index in range(len(complete_result)):
complete_result[index].extend(predict[index])
temp = pd.DataFrame(complete_result,
columns = ["visit_id",'visit_month',u +'_plus_0_months',
u +'_plus_6_months',
u +'_plus_12_months',
u +'_plus_24_months'])
temp = temp.melt( id_vars=["visit_id",'visit_month'],
value_vars=[ u +'_plus_0_months' , u +'_plus_6_months',
u +'_plus_12_months',u +"_plus_24_months"],
value_name = 'rating')
temp['prediction_id'] = temp['visit_id'] + '_' + temp['variable']
final.append(temp[['prediction_id','rating']])
final = pd.concat(final)
final = final.drop_duplicates(subset=['prediction_id', 'rating'])
return final
if __name__ == "__main__":
    # Script entry point: build per-target training tables from the clinical
    # data, fit one multi-output random forest per UPDRS target, report grouped
    # cross-validation SMAPE+1, then answer the test API with predictions.
    target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
    horizons = [0, 6, 12, 24]

    # Proteins and peptides are loaded but not used yet (see the TODO below).
    data_proteins = pd.read_csv('train_proteins.csv')
    data_clinical = pd.read_csv('train_clinical_data.csv')
    data_peptides = pd.read_csv('train_peptides.csv')
    data_supplemental = pd.read_csv('supplemental_clinical_data.csv')
    merged_data = pd.concat([data_clinical, data_supplemental])

    ## TODO: data cleaning and feature engineering
    # Right now, we only use the month data and the target data

    # Split by patient once instead of re-filtering the frame for every
    # (target, patient) pair. sort=False keeps first-appearance order.
    patient_groups = list(merged_data.groupby('patient_id', sort=False))

    data_for_train = {}
    for u in target:
        rows = []
        for id_, infor_of_id in patient_groups:
            month_per_id = infor_of_id.visit_month.tolist()
            for month in month_per_id:
                row = [month, id_]
                for plus in horizons:
                    if month + plus in month_per_id:
                        month_value = infor_of_id[infor_of_id.visit_month == month + plus][u].values[0]
                        # NaN never compares equal to anything, so
                        # `!= np.nan` was always True; test for it properly.
                        if pd.notna(month_value):
                            row.append(month_value)
                # Keep only visits with a known score at all four horizons.
                if len(row) == 6:
                    rows.append(row)
        frame = pd.DataFrame(rows, columns=['month', 'patient_id', u + '+0', u + '+6', u + '+12', u + '+24'])
        data_for_train[u] = frame.dropna()

    ## train model
    model = {}
    overall_score = []
    for u in target:
        # Train data: visit month is the only feature; the four horizon
        # columns are the multi-output target.
        X = data_for_train[u]['month'].values.reshape(-1, 1)
        y = data_for_train[u].iloc[:, 2:6]
        # Save model
        model[u] = RandomForestRegressor().fit(X, y)

        ## cross validation and print results
        print('Cross-validation scores')
        # Grouping by patient keeps all visits of one patient in a single fold.
        cvs = cross_val_score(RandomForestRegressor(),
                              X=X, y=y,
                              groups=data_for_train[u]['patient_id'],
                              scoring=smapep1_scorer,
                              cv=GroupKFold(n_splits=8),
                              error_score='raise')
        # The scorer is negated, so flip the sign back for display. Label with
        # the real target name (the enumerate index printed updrs_0..updrs_3).
        print([f'{u}:'], -cvs.round(1), -cvs.mean().round(1))
        overall_score.append(-cvs)
    print(f'Overall cv score of the group model: {np.array(overall_score).mean():.2f}')

    ## save to submission.csv file for the test set by using this following API call
    env = MockApi()
    iter_test = env.iter_test()  # an iterator which loops over the test files

    # The API will deliver four dataframes in this specific order:
    for test_clinical_data, test_peptides, test_proteins, sample_submission in iter_test:
        pred = get_predictions(test_clinical_data, model).round(0)
        # Fill every rating in one pass; ids without a prediction keep the
        # value already present in the sample submission.
        rating_by_id = pred.drop_duplicates(subset='prediction_id').set_index('prediction_id')['rating']
        predicted = sample_submission['prediction_id'].map(rating_by_id)
        sample_submission['rating'] = predicted.fillna(sample_submission['rating'])
        env.predict(sample_submission)  # register your predictions