# NOTE: scraped page header, not part of the script: "Spaces: Runtime error"
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, GroupKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_consistent_length

from public_timeseries_testing_util import MockApi
# Define the metric
def smapep1(y_true, y_pred):
    """SMAPE of y+1, a nonnegative float, smaller is better.

    Parameters: y_true, y_pred: array-like
    Returns 100 for 100 % error.
    y_true may have missing values; positions where y_true is not finite
    are dropped from both arrays before the score is computed.

    Raises ValueError if the inputs have inconsistent lengths or if any
    remaining y_true or y_pred value is negative.
    """
    check_consistent_length(y_true, y_pred)
    # np.asarray avoids a copy when possible but still converts lists etc.;
    # np.array(..., copy=False) raises under NumPy >= 2.0 when a copy is needed.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    # Build the mask once, from the unfiltered y_true, and apply it to both.
    finite = np.isfinite(y_true)
    y_true, y_pred = y_true[finite], y_pred[finite]
    if (y_true < 0).any():
        raise ValueError('y_true < 0')
    if (y_pred < 0).any():
        raise ValueError('y_pred < 0')
    # The "+ 1" keeps the denominator positive when both values are 0.
    denominator = (y_true + y_pred) / 2 + 1
    ape = np.abs(y_pred - y_true) / denominator
    return np.average(ape) * 100
# The scorer returns nonpositive values so that greater is better.
# It will be used as an argument to cross_val_score
smapep1_scorer = make_scorer(smapep1, greater_is_better=False)
def get_predictions(my_train, model):
    """Build the long-format forecast table for a batch of visits.

    For each target ``updrs_1`` .. ``updrs_4`` the estimator stored under
    that key in ``model`` predicts from ``visit_month`` alone. Every
    predicted row must contain four values, which are labelled, in order,
    as the rating 0, 6, 12 and 24 months after the visit.

    Parameters:
        my_train: DataFrame with at least the columns ``visit_id`` and
            ``visit_month``. Missing values are replaced by 0 first.
        model: mapping from target name to an object whose ``predict``
            accepts an (n, 1) array of months and returns n rows of four
            values (anything exposing ``.tolist()``).

    Returns:
        DataFrame with the columns ``prediction_id`` (formatted as
        ``<visit_id>_<target>_plus_<h>_months``) and ``rating``, with
        exact duplicate (prediction_id, rating) rows removed.
    """
    # Forecast
    my_train = my_train.fillna(0)
    id_cols = ["visit_id", 'visit_month']
    target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]

    # Neither the feature matrix nor the id columns depend on the target,
    # so they are computed once instead of once per target.
    months = my_train["visit_month"].values.reshape(-1, 1)
    visits = my_train[id_cols].values.tolist()

    final = []
    for u in target:
        value_cols = [u + '_plus_0_months',
                      u + '_plus_6_months',
                      u + '_plus_12_months',
                      u + '_plus_24_months']

        # Predict
        predicted = model[u].predict(months).tolist()

        # One wide row per visit: the id columns followed by four ratings.
        rows = [[*visit, *ratings] for visit, ratings in zip(visits, predicted)]
        temp = pd.DataFrame(rows, columns=id_cols + value_cols)

        # Wide -> long: one row per (visit, horizon).
        temp = temp.melt(id_vars=id_cols,
                         value_vars=value_cols,
                         value_name='rating')
        temp['prediction_id'] = temp['visit_id'] + '_' + temp['variable']
        final.append(temp[['prediction_id', 'rating']])

    final = pd.concat(final)
    final = final.drop_duplicates(subset=['prediction_id', 'rating'])
    return final
if __name__ == "__main__":
    # Script entry point: build (month -> ratings at +0/+6/+12/+24 months)
    # training tables per target, fit one RandomForestRegressor per target,
    # report grouped cross-validation scores, then fill the submission
    # frames delivered by the MockApi.
    target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
    horizons = [0, 6, 12, 24]

    data_proteins = pd.read_csv('train_proteins.csv')
    data_clinical = pd.read_csv('train_clinical_data.csv')
    data_peptides = pd.read_csv('train_peptides.csv')
    data_supplemental = pd.read_csv('supplemental_clinical_data.csv')
    merged_data = pd.concat([data_clinical, data_supplemental])

    ## TODO: data cleaning and feature engineering
    # Right now, we only use the month data and the target data
    # (data_proteins and data_peptides are loaded but not used yet).
    id_list = merged_data['patient_id'].unique().tolist()
    data_for_train = {}
    for u in target:
        final = []
        for id_ in id_list:
            infor_of_id = merged_data[merged_data['patient_id'] == id_]
            month_per_id = infor_of_id.visit_month.tolist()
            months_seen = set(month_per_id)  # O(1) membership tests below
            for month in month_per_id:
                check = [month, id_]
                for plus in horizons:
                    if month + plus in months_seen:
                        month_value = infor_of_id[infor_of_id.visit_month == month + plus][u].values[0]
                        # `x != np.nan` is always True, so use pd.notna to
                        # actually skip missing ratings.
                        if pd.notna(month_value):
                            check.append(month_value)
                # Keep the visit only if all four horizons were observed.
                if len(check) == 6:
                    final.append(check)
        check = pd.DataFrame(final, columns=['month', 'patient_id', u + '+0', u + '+6', u + '+12', u + '+24'])
        data_for_train[u] = check.dropna()

    ## train model
    model = {}
    overall_score = []
    for u in target:
        # Train data
        X = data_for_train[u]['month'].values.reshape(-1, 1)
        y = data_for_train[u].iloc[:, 2:6]
        trained = RandomForestRegressor().fit(X, y)
        # Save model
        model[u] = trained

        ## cross validation and print results
        print('Cross-validation scores')
        # Group by patient so that one patient's visits never appear in
        # both the training and the validation fold.
        cvs = cross_val_score(RandomForestRegressor(),
                              X=X, y=y,
                              groups=data_for_train[u]['patient_id'],
                              scoring=smapep1_scorer,
                              cv=GroupKFold(n_splits=8),
                              error_score='raise')
        # Label with the target name; the enumerate index used before was
        # off by one (it printed updrs_0 .. updrs_3).
        print([f'{u}:'], -cvs.round(1), -cvs.mean().round(1))
        overall_score.append(-cvs)
    print(f'Overall cv score of the group model: {np.array(overall_score).mean():.2f}')

    ## save to submission.csv file for the test set by using this following API call
    env = MockApi()
    iter_test = env.iter_test()    # an iterator which loops over the test files

    # The API will deliver four dataframes in this specific order:
    for test_clinical_data, test_peptides, test_proteins, sample_submission in iter_test:
        # TODO - make your predictions here by modifying 'rating' sample_submission dataframe
        pred = get_predictions(test_clinical_data, model).round(0)

        for index in sample_submission['prediction_id']:
            sample_submission.loc[sample_submission['prediction_id'] == index, 'rating'] = pred[pred['prediction_id'] == index]['rating'].values

        env.predict(sample_submission)   # register your predictions