Spaces:
Running
Running
File size: 1,171 Bytes
c822df8 b896bd3 c822df8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
"""Functions for doing feature selection during preprocessing."""
import numpy as np
def run_feature_selection(X, y, select_k_features, random_state=None):
    """
    Pick the indices of the most important features in X.

    Fits a shallow random forest regressor on (X, y) as a proxy for
    feature importance, then asks SelectFromModel for the indices of
    up to `select_k_features` features, which are returned.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_selection import SelectFromModel

    forest = RandomForestRegressor(
        n_estimators=100, max_depth=3, random_state=random_state
    ).fit(X, y)

    # threshold=-inf switches off the importance cut-off, so the number of
    # kept features is governed by max_features alone.
    picker = SelectFromModel(
        forest, prefit=True, max_features=select_k_features, threshold=-np.inf
    )
    return picker.get_support(indices=True)
# Retained only because the module tests still use this function
def _handle_feature_selection(X, select_k_features, y, variable_names):
    """
    Optionally restrict X to its most important columns.

    When `select_k_features` is None, X is returned untouched together
    with a selection of None. Otherwise the column indices are chosen by
    `run_feature_selection`, the matching entries of `variable_names` are
    printed, and the column-subset of X is returned with those indices.
    """
    # Nothing requested: hand the data back as-is.
    if select_k_features is None:
        return X, None

    chosen = run_feature_selection(X, y, select_k_features)
    chosen_names = [variable_names[idx] for idx in chosen]
    print(f"Using features {chosen_names}")
    return X[:, chosen], chosen
|