|
from catboost import CatBoostClassifier, Pool |
|
from sklearn.model_selection import GridSearchCV |
|
from sklearn.model_selection import KFold |
|
from sklearn.model_selection import ParameterGrid |
|
|
|
import pyro |
|
import pyro.distributions as dist |
|
from pyro.nn import PyroModule, PyroSample |
|
from pyro.infer.autoguide import AutoDiagonalNormal |
|
from pyro.infer import SVI, Trace_ELBO, Predictive, MCMC, NUTS |
|
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor |
|
from sklearn.metrics import accuracy_score, roc_auc_score |
|
import argparse |
|
import itertools |
|
|
|
from train import train, get_weighted_single_eval_pos_sampler, Losses |
|
import priors |
|
import encoders |
|
from sklearn import preprocessing |
|
|
|
from sklearn.base import BaseEstimator, ClassifierMixin |
|
|
|
from torch import nn |
|
|
|
from datasets import * |
|
import xgboost as xgb

import pandas as pd
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
|
|
import torch |
|
from sklearn import neighbors, datasets |
|
from sklearn.gaussian_process import GaussianProcessClassifier |
|
from sklearn.gaussian_process.kernels import RBF |
|
|
|
from priors.utils import trunc_norm_sampler_f, beta_sampler_f, gamma_sampler_f, uniform_sampler_f, zipf_sampler_f, scaled_beta_sampler_f, uniform_int_sampler_f |
|
|
|
from tqdm import tqdm |
|
import time |
|
import random |
|
|
|
import os |
|
|
|
CV = 5 |
|
param_grid = {} |
|
metric_used = roc_auc_score |
|
|
|
def get_uniform_single_eval_pos_sampler(max_len): |
|
""" |
|
    Sample any evaluation position uniformly at random.
|
:return: Sampler that can be fed to `train()` as `single_eval_pos_gen`. |
|
""" |
|
return lambda: random.choices(range(max_len))[0] |
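
# Illustrative use (not executed here): the sampler draws a single evaluation position
# uniformly at random, e.g.
#   sampler = get_uniform_single_eval_pos_sampler(100)
#   pos = sampler()  # an integer in [0, 100)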
|
|
|
|
|
def get_mlp_prior_hyperparameters(config): |
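    # Assembles the positional hyperparameter tuple consumed by the MLP prior
    # (priors.mlp.DataLoader); the entries follow the config keys referenced below,
    # and their order has to match what the prior expects, so reorder with care.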
|
sigma_sampler = gamma_sampler_f(config["prior_sigma_gamma_k"], config["prior_sigma_gamma_theta"]) |
|
noise_std_sampler = gamma_sampler_f(config["prior_noise_std_gamma_k"], config["prior_noise_std_gamma_theta"]) |
|
|
|
mlp_prior_hyperparameters = (list(config["prior_nlayers_sampler"].values())[0] |
|
, list(config["prior_emsize_sampler"].values())[0] |
|
, config["prior_activations"] |
|
, sigma_sampler |
|
, noise_std_sampler |
|
, list(config["prior_dropout_sampler"].values())[0] |
|
, True |
|
, list(config["prior_num_features_used_sampler"].values())[0] |
|
, list(config["prior_causes_sampler"].values())[0] if config['prior_is_causal'] else None |
|
, config["prior_is_causal"] |
|
, config["prior_pre_sample_causes"] if config['prior_is_causal'] else None |
|
, config["prior_pre_sample_weights"] if config['prior_is_causal'] else None |
|
, config["prior_y_is_effect"] if config['prior_is_causal'] else None |
|
, config["prior_order_y"] |
|
, config["prior_normalize_by_used_features"] |
|
, list(config["prior_categorical_feats"].values())[0] if config['prior_is_causal'] else None |
|
, 0.0 |
|
) |
|
|
|
return mlp_prior_hyperparameters |
|
|
|
|
|
def get_gp_mix_prior_hyperparameters(config): |
|
return {'lengthscale_concentration': config["prior_lengthscale_concentration"], |
|
'nu': config["prior_nu"], |
|
'outputscale_concentration': config["prior_outputscale_concentration"], |
|
'categorical_data': config["prior_y_minmax_norm"], |
|
            'y_minmax_norm': config["prior_y_minmax_norm"],
|
'noise_concentration': config["prior_noise_concentration"], |
|
'noise_rate': config["prior_noise_rate"]} |
|
|
|
|
|
def get_gp_prior_hyperparameters(config): |
|
|
|
|
|
return (config['prior_noise'] |
|
, lambda : config['prior_outputscale'] |
|
, lambda : config['prior_lengthscale'] |
|
, True |
|
, list(config['prior_num_features_used_sampler'].values())[0] |
|
, config['prior_normalize_by_used_features'] |
|
, config['prior_order_y']) |
|
|
|
|
|
def get_meta_gp_prior_hyperparameters(config): |
|
lengthscale_sampler = trunc_norm_sampler_f(config["prior_lengthscale_mean"], config["prior_lengthscale_mean"] * config["prior_lengthscale_std_f"]) |
|
outputscale_sampler = trunc_norm_sampler_f(config["prior_outputscale_mean"], config["prior_outputscale_mean"] * config["prior_outputscale_std_f"]) |
|
|
|
return (config['prior_noise'] |
|
, outputscale_sampler |
|
, lengthscale_sampler |
|
, True |
|
, list(config['prior_num_features_used_sampler'].values())[0] |
|
, config['prior_normalize_by_used_features'] |
|
, config['prior_order_y']) |
|
|
|
|
|
|
|
def get_model(config, device, eval_positions, should_train=True, verbose=False): |
|
extra_kwargs = {} |
|
if config['prior_type'] == 'mlp': |
|
prior_hyperparameters = get_mlp_prior_hyperparameters(config) |
|
model_proto = priors.mlp.DataLoader |
|
extra_kwargs['batch_size_per_gp_sample'] = 8 |
|
elif config['prior_type'] == 'gp': |
|
prior_hyperparameters = get_gp_prior_hyperparameters(config) |
|
model_proto = priors.fast_gp.DataLoader |
|
elif config['prior_type'] == 'custom_gp_mix': |
|
prior_hyperparameters = get_meta_gp_prior_hyperparameters(config) |
|
model_proto = priors.fast_gp.DataLoader |
|
elif config['prior_type'] == 'gp_mix': |
|
prior_hyperparameters = get_gp_mix_prior_hyperparameters(config) |
|
model_proto = priors.fast_gp_mix.DataLoader |
|
else: |
|
        raise ValueError(f"Unknown prior_type: {config['prior_type']}")
|
|
|
epochs = 0 if not should_train else config['epochs'] |
|
model = train(model_proto |
|
, Losses.bce |
|
, encoders.Linear |
|
, emsize=config['emsize'] |
|
, nhead=config['nhead'] |
|
, y_encoder_generator=encoders.Linear |
|
, pos_encoder_generator=None |
|
, batch_size=config['batch_size'] |
|
, nlayers=config['nlayers'] |
|
, nhid=config['emsize'] * config['nhid_factor'] |
|
, epochs=epochs |
|
, warmup_epochs=epochs // 4 |
|
, bptt=config['bptt'] |
|
, gpu_device=device |
|
, dropout=config['dropout'] |
|
, steps_per_epoch=100 |
|
, single_eval_pos_gen=get_uniform_single_eval_pos_sampler(max(eval_positions) + 1) |
|
, extra_prior_kwargs_dict={ |
|
'num_features': config['num_features'] |
|
|
|
, 'fuse_x_y': False |
|
, 'hyperparameters': prior_hyperparameters |
|
, **extra_kwargs |
|
} |
|
, lr=config['lr'] |
|
, verbose=verbose) |
|
|
|
return model |
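
# Illustrative call (hypothetical values; the required config keys are the ones read in
# get_model and in the get_*_prior_hyperparameters helpers above):
#   config = {'prior_type': 'mlp', 'epochs': 50, 'emsize': 256, 'nhead': 4,
#             'nhid_factor': 2, 'nlayers': 6, 'batch_size': 64, 'bptt': 100,
#             'dropout': 0.1, 'lr': 1e-4, 'num_features': 30, ...}
#   model = get_model(config, device='cuda:0', eval_positions=[30])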
|
|
|
|
|
|
|
|
|
def evaluate(datasets, model, method, bptt, eval_position_range, device, max_features=0, plot=False, extend_features=False, save=True, rescale_features=False, overwrite=False, |
|
max_samples=40, path_interfix=''): |
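    # For each dataset: load a cached result from disk if present (unless `overwrite` is set),
    # otherwise run evaluate_dataset, record per-position metrics and the elapsed time, and
    # optionally save the result back to disk. Aggregate means are appended at the end.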
|
|
|
result = {'metric': 'auc'} |
|
|
|
metric_sum = 0 |
|
for [name, X, y, categorical_feats] in datasets: |
|
result_ds = {} |
|
path = f'/home/hollmann/prior-fitting/results/tabular/{path_interfix}/results_{method}_{name}.npy' |
|
if (os.path.isfile(path)) and not overwrite: |
|
with open(path, 'rb') as f: |
|
result_ds = np.load(f, allow_pickle=True).tolist() |
|
if 'time' in result_ds: |
|
result_ds[name+'_time'] = result_ds['time'] |
|
del result_ds['time'] |
|
result.update(result_ds) |
|
mean_metric = float(result[name + '_mean_metric_at_' + str(eval_position_range[-1])]) |
|
metric_sum += mean_metric |
|
print(f'Loaded saved result for {name} (mean metric {mean_metric})') |
|
continue |
|
|
|
print('Evaluating ' + str(name)) |
|
rescale_features_factor = X.shape[1] / max_features if rescale_features and extend_features else 1.0 |
|
if extend_features: |
|
X = torch.cat([X, torch.zeros((X.shape[0], max_features - X.shape[1]))], -1) |
|
|
|
start_time = time.time() |
|
ds_result = evaluate_dataset(X.to(device), y.to(device), categorical_feats, model, bptt, eval_position_range, |
|
rescale_features=rescale_features_factor, max_samples=max_samples) |
|
elapsed = time.time() - start_time |
|
|
|
for i, r in enumerate(ds_result): |
|
metric, outputs, ys = r |
|
if save: |
|
result_ds[name + '_per_ds_metric_at_' + str(eval_position_range[i])] = metric |
|
result_ds[name + '_outputs_at_' + str(eval_position_range[i])] = outputs |
|
result_ds[name + '_ys_at_' + str(eval_position_range[i])] = ys |
|
|
|
result_ds[name + '_mean_metric_at_' + str(eval_position_range[i])] = metric_used(ys.detach().cpu().flatten(), outputs.flatten()) |
|
result_ds[name + '_time'] = elapsed |
|
|
|
if save: |
|
with open(path, 'wb') as f: |
|
np.save(f, result_ds) |
|
|
|
result.update(result_ds) |
|
metric_sum += float(metric[-1].mean()) |
|
|
|
for pos in eval_position_range: |
|
result[f'mean_metric_at_{pos}'] = np.array([result[d[0] + '_mean_metric_at_' + str(pos)] for d in datasets]).mean() |
|
|
|
result['mean_metric'] = np.array([result['mean_metric_at_' + str(pos)] for pos in eval_position_range]).mean() |
|
|
|
return result |
|
|
|
|
|
def evaluate_dataset(X, y, categorical_feats, model, bptt, eval_position_range, plot=False, rescale_features=1.0, |
|
max_samples=40): |
|
result = [] |
|
for eval_position in eval_position_range: |
|
r = evaluate_position(X, y, categorical_feats, model, bptt, eval_position, rescale_features=rescale_features, |
|
max_samples=max_samples) |
|
result.append(r) |
|
print('\t Eval position ' + str(eval_position) + ' done..') |
|
|
|
if plot: |
|
        plt.plot(np.array(list(eval_position_range)), np.array([r[0].mean() for r in result]))
|
|
|
return result |
|
|
|
|
|
def evaluate_position(X, y, categorical_feats, model, bptt, eval_position, rescale_features=1.0, max_samples=40): |
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_xs = [] |
|
eval_ys = [] |
|
num_evals = len(X) - bptt |
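
    # Slide a window of length `bptt` over the dataset: window i covers rows [i, i + bptt).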
|
|
|
|
|
|
|
|
|
|
|
|
|
for i in range(num_evals): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
x_ = X[i:i + bptt].clone() |
|
y_ = y[i:i + bptt].clone() |
|
|
|
eval_xs.append(x_) |
|
eval_ys.append(y_) |
|
|
|
|
|
|
|
eval_xs = torch.stack(eval_xs, 1) |
|
eval_ys = torch.stack(eval_ys, 1) |
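
    # Keep at most `max_samples` windows, selected with a fixed seed so that every method is
    # evaluated on the same random subset of windows.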
|
|
|
|
|
with torch.random.fork_rng(): |
|
torch.random.manual_seed(13) |
|
sel = torch.randperm(eval_xs.shape[1]) |
|
eval_xs = eval_xs[:, sel[0:max_samples], :] |
|
eval_ys = eval_ys[:, sel[0:max_samples]] |
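
    # Two evaluation paths: a trained transformer (nn.Module) is queried one evaluation
    # position at a time below, while classical baselines are fitted per window via batch_pred.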
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if isinstance(model, nn.Module): |
|
model.eval() |
|
        outputs = np.zeros(shape=(eval_xs.shape[0] - eval_position, eval_xs.shape[1]))
|
for i, pos in enumerate(range(eval_position, eval_xs.shape[0])): |
|
eval_x = torch.cat([eval_xs[:eval_position], eval_xs[pos].unsqueeze(0)]) |
|
eval_y = eval_ys[:eval_position] |
|
|
|
|
|
mean = eval_x.mean(0) |
|
std = eval_x.std(0) + .000001 |
|
eval_x = (eval_x - mean) / std |
|
eval_x = eval_x / rescale_features |
|
|
|
output = torch.sigmoid(model((eval_x, eval_y.float()), single_eval_pos=eval_position)).squeeze(-1) |
|
outputs[i, :] = output.detach().cpu().numpy() |
|
|
|
metric_per_t = np.array([metric_used(eval_ys[eval_position:, i].cpu(), outputs[:, i]) for i in range(eval_xs.shape[1])]) |
|
return metric_per_t, outputs, eval_ys[eval_position:] |
|
else: |
|
metric_eval_pos, outputs = batch_pred(model, eval_xs, eval_ys, categorical_feats, start=eval_position) |
|
|
|
return metric_eval_pos, outputs, eval_ys[eval_position:] |
|
|
|
|
|
def batch_pred(metric_function, eval_xs, eval_ys, categorical_feats, start=2): |
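    # For every window: standardize the features with statistics from the first `start`
    # rows only, fit the baseline on those rows, and score its predictions on the rest.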
|
metrics = [] |
|
outputs = [] |
|
|
|
eval_splits = list(zip(eval_xs.transpose(0, 1), eval_ys.transpose(0, 1))) |
|
for eval_x, eval_y in tqdm(eval_splits): |
|
mean = eval_x[:start].mean(0) |
|
std = eval_x[:start].std(0) + .000001 |
|
eval_x = (eval_x - mean) / std |
|
|
|
metric, output = metric_function(eval_x[:start], eval_y[:start], eval_x[start:], eval_y[start:], categorical_feats) |
|
metrics += [metric] |
|
outputs += [output] |
|
|
|
return np.array(metrics), np.array(outputs).T |
|
|
|
|
|
|
|
|
|
from sklearn.linear_model import RidgeClassifier

# Search space for the ridge baseline; these are plausible defaults rather than tuned values.
param_grid['ridge'] = {'alpha': [0.1, 0.5, 1.0, 2.0, 5.0], 'fit_intercept': [True, False]}
|
|
|
def ridge_metric(x, y, test_x, test_y, cat_features): |
|
import warnings |
|
def warn(*args, **kwargs): |
|
pass |
|
|
|
warnings.warn = warn |
|
|
|
x, y, test_x, test_y = x.cpu(), y.cpu(), test_x.cpu(), test_y.cpu() |
|
|
|
clf = RidgeClassifier() |
|
|
|
|
|
|
|
clf = GridSearchCV(clf, param_grid['ridge'], cv=min(CV, x.shape[0]//2)) |
|
|
|
clf.fit(x, y.long()) |
|
|
|
pred = clf.decision_function(test_x) |
|
metric = metric_used(test_y.cpu().numpy(), pred) |
|
|
|
return metric, pred |
|
|
|
|
|
from sklearn.linear_model import LogisticRegression |
|
param_grid['logistic'] = {'solver': ['saga'], 'penalty': ['l1', 'l2', 'none'], 'tol': [1e-2, 1e-4, 1e-10], 'max_iter': [500], 'fit_intercept': [True, False], 'C': [1e-5, 0.001, 0.01, 0.1, 1.0, 2.0]} |
|
def logistic_metric(x, y, test_x, test_y, cat_features): |
|
import warnings |
|
def warn(*args, **kwargs): |
|
pass |
|
|
|
warnings.warn = warn |
|
|
|
x, y, test_x, test_y = x.cpu(), y.cpu(), test_x.cpu(), test_y.cpu() |
|
|
|
clf = LogisticRegression() |
|
|
|
|
|
|
|
clf = GridSearchCV(clf, param_grid['logistic'], cv=min(CV, x.shape[0]//2)) |
|
|
|
clf.fit(x, y.long()) |
|
|
|
pred = clf.predict_proba(test_x)[:, 1] |
|
metric = metric_used(test_y.cpu().numpy(), pred) |
|
|
|
return metric, pred |
|
|
|
|
|
|
|
param_grid['knn'] = {'n_neighbors': np.arange(1, 6)}  # upper bound is further capped by the number of training samples inside knn_metric
|
def knn_metric(x, y, test_x, test_y, cat_features): |
|
x, y, test_x, test_y = x.cpu(), y.cpu(), test_x.cpu(), test_y.cpu() |
|
|
|
clf = neighbors.KNeighborsClassifier() |
|
param_grid_knn = {'n_neighbors': np.arange(1, min(6, len(y) - 1))} |
|
|
|
|
|
clf = GridSearchCV(clf, param_grid_knn, cv=min(CV, x.shape[0]//2)) |
|
|
|
clf.fit(x, y.long()) |
|
|
|
|
|
|
|
|
|
pred = clf.predict_proba(test_x)[:, 1] |
|
|
|
metric = metric_used(test_y.cpu().numpy(), pred) |
|
|
|
return metric, pred |
|
|
|
|
|
|
|
class BayesianModel(PyroModule): |
|
def __init__(self, model_spec, device='cuda'): |
|
super().__init__() |
|
|
|
self.device = device |
|
self.num_features = model_spec['num_features'] |
|
|
|
mu, sigma = torch.tensor([0.0]).to(self.device), torch.tensor([1.0]).to(self.device) |
|
|
|
self.fc1 = PyroModule[nn.Linear](self.num_features, model_spec['embed']) |
|
self.fc1.weight = PyroSample( |
|
dist.Normal(mu, sigma).expand([model_spec['embed'], self.num_features]).to_event(2)) |
|
self.fc1.bias = PyroSample(dist.Normal(mu, sigma).expand([model_spec['embed']]).to_event(1)) |
|
self.fc2 = PyroModule[nn.Linear](model_spec['embed'], 2) |
|
self.fc2.weight = PyroSample(dist.Normal(mu, sigma).expand([2, model_spec['embed']]).to_event(2)) |
|
self.fc2.bias = PyroSample(dist.Normal(mu, sigma).expand([2]).to_event(1)) |
|
|
|
self.model = torch.nn.Sequential(self.fc1, self.fc2) |
|
|
|
self.to(self.device) |
|
|
|
def forward(self, x=None, y=None, seq_len=1): |
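        # When x is None the model is used generatively: inputs of length `seq_len` are
        # drawn from a standard normal prior; otherwise the given x is scored. Labels are
        # sampled from a Categorical over the softmax of the network output.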
|
if x is None: |
|
with pyro.plate("x_plate", seq_len): |
|
d_ = dist.Normal(torch.tensor([0.0]).to(self.device), torch.tensor([1.0]).to(self.device)).expand( |
|
[self.num_features]).to_event(1) |
|
x = pyro.sample("x", d_) |
|
|
|
out = self.model(x) |
|
mu = out.squeeze() |
|
softmax = torch.nn.Softmax(dim=1) |
|
|
|
with pyro.plate("data", out.shape[0]): |
|
|
|
|
|
s = softmax(mu) |
|
obs = pyro.sample('obs', dist.Categorical(probs=s), obs=y).float() |
|
|
|
return x, obs |
|
|
|
|
|
class BayesianNNClassifier(BaseEstimator, ClassifierMixin): |
|
|
|
    def __init__(self, num_features, n_layers, embed, lr, device, num_steps=400, num_pred_samples=400):

        self.num_pred_samples = num_pred_samples

        self.num_steps = num_steps
|
self.embed = embed |
|
self.n_layers = n_layers |
|
self.lr = lr |
|
self.num_features = num_features |
|
self.device = device |
|
|
|
def fit(self, X, y): |
|
model_spec = {'nlayers': 2, 'embed': self.embed, 'num_features': self.num_features} |
|
|
|
self.model = BayesianModel(model_spec, device=self.device) |
|
self.guide = AutoDiagonalNormal(self.model).to(self.device) |
|
self.adam = pyro.optim.Adam({"lr": self.lr}) |
|
self.svi = SVI(self.model, self.guide, self.adam, loss=Trace_ELBO()) |
|
|
|
pyro.clear_param_store() |
|
|
|
X = X.to(self.device) |
|
y = y.to(self.device) |
|
|
|
for epoch in tqdm(range(0, self.num_steps)): |
|
loss = self.svi.step(X, y) |
|
|
|
|
|
return self |
|
|
|
def predict(self, X): |
|
X = X.to(self.device) |
|
predictive = Predictive(self.model, guide=self.guide, num_samples=self.num_pred_samples) |
|
preds = predictive(X)['obs'] |
|
preds_means = preds.float().mean(axis=0).detach().cpu() |
|
preds_hard = preds_means > 0.5 |
|
|
|
return preds_hard.long() |
|
|
|
def predict_proba(self, X): |
|
X = X.to(self.device) |
|
predictive = Predictive(self.model, guide=self.guide, num_samples=self.num_pred_samples) |
|
preds = predictive(X)['obs'] |
|
preds_means = preds.float().mean(axis=0).detach().cpu() |
|
|
|
return preds_means |
|
|
|
def score(self, X, y): |
|
return super().score(X, y) |
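
# Sketch of standalone use (torch tensors in, torch tensor of class-1 probabilities out;
# values below are illustrative):
#   clf = BayesianNNClassifier(num_features=X_train.shape[1], n_layers=2, embed=30,
#                              lr=1e-3, device='cpu')
#   clf.fit(X_train, y_train)
#   proba = clf.predict_proba(X_test)   # mean over posterior predictive samples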
|
|
|
param_grid['bayes'] = {'embed': [5, 10, 30, 64], 'lr': [1e-3, 1e-4], 'num_steps': [400], 'num_pred_samples': [400]}
|
def bayes_net_metric(x, y, test_x, test_y, cat_features): |
|
device = x.device |
|
|
|
clf = BayesianNNClassifier(x.shape[1], 2, 1, 1e-3, device) |
|
|
|
|
|
    clf = GridSearchCV(clf, param_grid['bayes'], cv=min(CV, x.shape[0]//2))
|
|
|
clf.fit(x.cpu(), y.long().cpu()) |
|
|
|
pred = clf.predict_proba(test_x) |
|
metric = metric_used(test_y.cpu().numpy(), pred.cpu().numpy()) |
|
|
|
return metric, pred |
|
|
|
|
|
param_grid['gp'] = {'params_y_scale': [0.05, 0.1, 0.5, 1.0, 5.0, 10.0], |
|
'params_length_scale': [0.1, 0.5, 1.0, 2.0]} |
|
def gp_metric(x, y, test_x, test_y, cat_features): |
|
import warnings |
|
def warn(*args, **kwargs): |
|
pass |
|
warnings.warn = warn |
|
|
|
x, y, test_x, test_y = x.cpu(), y.cpu(), test_x.cpu(), test_y.cpu() |
|
|
|
clf = GaussianProcessClassifier() |
|
|
|
params_y_scale = [0.05, 0.1, 0.5, 1.0, 5.0, 10.0] |
|
params_length_scale = [0.1, 0.5, 1.0, 2.0] |
|
    param_grid_gp = {'kernel': [y_scale * RBF(length) for (y_scale, length) in itertools.product(params_y_scale, params_length_scale)]}



    clf = GridSearchCV(clf, param_grid_gp, cv=min(CV, x.shape[0]//2))
|
|
|
clf.fit(x, y.long()) |
|
pred = clf.predict_proba(test_x)[:, 1] |
|
metric = metric_used(test_y.cpu().numpy(), pred) |
|
|
|
return metric, pred |
|
|
|
|
|
|
|
|
|
param_grid['tabnet'] = {'n_d': [2, 4], 'n_steps': [2,4,6], 'gamma': [1.3], 'optimizer_params': [{'lr': 2e-2}, {'lr': 2e-1}]} |
|
|
|
def tabnet_metric(x, y, test_x, test_y, cat_features): |
|
x, y, test_x, test_y = x.cpu().numpy(), y.cpu().numpy(), test_x.cpu().numpy(), test_y.cpu().numpy() |
|
|
|
mean_metrics = [] |
|
mean_best_epochs = [] |
|
|
|
for params in list(ParameterGrid(param_grid['tabnet'])): |
|
kf = KFold(n_splits=min(5, x.shape[0]//2), random_state=None, shuffle=False) |
|
metrics = [] |
|
best_epochs = [] |
|
for train_index, test_index in kf.split(x): |
|
X_train, X_valid, y_train, y_valid = x[train_index], x[test_index], y[train_index], y[test_index] |
|
|
|
clf = TabNetClassifier(verbose=True, cat_idxs=cat_features, n_a=params['n_d'], **params) |
|
|
|
clf.fit( |
|
X_train, y_train, |
|
|
|
) |
|
|
|
            metric = metric_used(y_valid, clf.predict_proba(X_valid)[:, 1])
|
metrics += [metric] |
|
|
|
mean_metrics += [np.array(metrics).mean()] |
|
|
|
|
|
mean_metrics = np.array(mean_metrics) |
|
|
|
params_used = np.array(list(ParameterGrid(param_grid['tabnet']))) |
|
|
|
best_idx = np.argmax(mean_metrics) |
|
|
|
clf = TabNetClassifier(cat_idxs=cat_features, **params_used[best_idx]) |
|
|
|
clf.fit( |
|
x, y |
|
) |
|
|
|
pred = 1 - clf.predict_proba(test_x)[:,0] |
|
metric = metric_used(test_y, pred) |
|
|
|
|
|
|
|
return metric, pred |
|
|
|
|
|
|
|
param_grid['catboost'] = {'learning_rate': [0.1, 0.5, 1.0], |
|
'depth': [2, 4, 7], |
|
'l2_leaf_reg': [0.0, 0.5, 1], |
|
'iterations': [10, 40, 70], |
|
'loss_function': ['Logloss']} |
|
def catboost_metric(x, y, test_x, test_y, categorical_feats): |
|
import warnings |
|
def warn(*args, **kwargs): |
|
pass |
|
|
|
warnings.warn = warn |
|
|
|
x, y, test_x, test_y = x.numpy(), y.numpy(), test_x.numpy(), test_y.numpy() |
|
|
|
def make_pd_from_np(x): |
|
data = pd.DataFrame(x) |
|
for c in categorical_feats: |
|
data.iloc[:, c] = data.iloc[:, c].astype('int') |
|
return data |
|
|
|
x = make_pd_from_np(x) |
|
test_x = make_pd_from_np(test_x) |
|
|
|
model = CatBoostClassifier(iterations=2, |
|
depth=2, |
|
learning_rate=1, |
|
loss_function='Logloss', |
|
logging_level='Silent') |
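
    # With default settings, CatBoost's grid_search refits the model on the full training
    # data using the best parameter combination (refit=True), so `model` can be used for
    # prediction directly afterwards.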
|
|
|
grid_search_result = model.grid_search(param_grid['catboost'], |
|
X=x, |
|
y=y, |
|
cv=5, |
|
plot=False, |
|
verbose=False) |
|
|
|
|
|
pred = model.predict_proba(test_x)[:, 1] |
|
    metric = metric_used(test_y, pred)
|
|
|
return metric, pred |
|
|
|
|
|
|
|
param_grid['xgb'] = { |
|
'min_child_weight': [0.5, 1.0], |
|
'learning_rate': [0.02, 0.2], |
|
|
|
'subsample': [0.5, 0.8], |
|
'max_depth': [1, 2], |
|
'colsample_bytree': [0.8], |
|
'eval_metric': ['logloss'], |
|
'n_estimators': [100] |
|
} |
|
def xgb_metric(x, y, test_x, test_y, cat_features): |
|
x, y, test_x, test_y = x.numpy(), y.numpy().astype(int), test_x.numpy(), test_y.numpy().astype(int) |
|
|
|
clf = xgb.XGBClassifier(use_label_encoder=False) |
|
|
|
|
|
|
|
clf = GridSearchCV(clf, param_grid['xgb'], cv=5, n_jobs=4, verbose=2) |
|
|
|
clf.fit(x, y.astype(int)) |
|
|
|
print(clf.best_params_) |
|
|
|
|
|
pred = clf.predict_proba(test_x)[:, 1] |
|
    metric = metric_used(test_y, pred)

    return metric, pred
|
|
|
def get_default_spec(test_datasets, valid_datasets): |
|
bptt = 100 |
|
eval_positions = [30] |
|
max_features = max([X.shape[1] for (_, X, _, _) in test_datasets] + [X.shape[1] for (_, X, _, _) in valid_datasets]) |
|
max_samples = 20 |
|
|
|
return bptt, eval_positions, max_features, max_samples |
|
|
|
if __name__ == '__main__': |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument('--method', default='ridge', type=str) |
|
parser.add_argument('--did', default=-1, type=int) |
|
    parser.add_argument('--overwrite', action='store_true')
|
args = parser.parse_args() |
|
|
|
test_datasets, _ = load_openml_list(test_dids_classification) |
|
valid_datasets, _ = load_openml_list(valid_dids_classification) |
|
|
|
selector = 'test' |
|
ds = valid_datasets if selector == 'valid' else test_datasets |
|
if args.did > -1: |
|
ds = ds[args.did:args.did+1] |
|
|
|
bptt, eval_positions, max_features, max_samples = get_default_spec(test_datasets, valid_datasets) |
|
|
|
if args.method == 'bayes': |
|
clf = bayes_net_metric |
|
device = 'cpu' |
|
elif args.method == 'gp': |
|
clf = gp_metric |
|
device = 'cpu' |
|
elif args.method == 'ridge': |
|
clf = ridge_metric |
|
device = 'cpu' |
|
elif args.method == 'knn': |
|
clf = knn_metric |
|
device = 'cpu' |
|
elif args.method == 'catboost': |
|
clf = catboost_metric |
|
device = 'cpu' |
|
elif args.method == 'tabnet': |
|
clf = tabnet_metric |
|
device = 'cpu' |
|
elif args.method == 'xgb': |
|
|
|
clf = xgb_metric |
|
device = 'cpu' |
|
elif args.method == 'logistic': |
|
clf = logistic_metric |
|
device = 'cpu' |
|
    else:

        raise ValueError(f'Unknown method: {args.method}')
|
|
|
start_time = time.time() |
|
result = evaluate(ds, clf, args.method, bptt, eval_positions, device=device, max_samples=max_samples, overwrite=args.overwrite, save=True) |
|
result['time_spent'] = time.time() - start_time |
|
|
|
with open(f'/home/hollmann/prior-fitting/results/tabular/results_{selector}_{args.method}.npy', 'wb') as f: |
|
np.save(f, result) |