MilesCranmer committed on
Commit
c822df8
·
1 Parent(s): 12e6d5e

Move feature selection functionality to separate file

Browse files
Files changed (3) hide show
  1. pysr/feature_selection.py +35 -0
  2. pysr/sr.py +1 -33
  3. pysr/test/test.py +3 -8
pysr/feature_selection.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Functions for doing feature selection during preprocessing."""
2
+ import numpy as np
3
+
4
+
5
+ def run_feature_selection(X, y, select_k_features, random_state=None) -> np.ndarray:
6
+ """
7
+ Find most important features.
8
+
9
+ Uses a random forest regressor as a proxy for finding
10
+ the k most important features in X, returning indices for those
11
+ features as output.
12
+ """
13
+ from sklearn.ensemble import RandomForestRegressor
14
+ from sklearn.feature_selection import SelectFromModel
15
+
16
+ clf = RandomForestRegressor(
17
+ n_estimators=100, max_depth=3, random_state=random_state
18
+ )
19
+ clf.fit(X, y)
20
+ selector = SelectFromModel(
21
+ clf, threshold=-np.inf, max_features=select_k_features, prefit=True
22
+ )
23
+ return selector.get_support(indices=True)
24
+
25
+
26
+ # This function is kept only because the module tests still use it
27
+ def _handle_feature_selection(X, select_k_features, y, variable_names):
28
+ if select_k_features is not None:
29
+ selection = run_feature_selection(X, y, select_k_features)
30
+ print(f"Using features {[variable_names[i] for i in selection]}")
31
+ X = X[:, selection]
32
+ else:
33
+ selection = None
34
+
35
+ return X, selection
pysr/sr.py CHANGED
@@ -25,6 +25,7 @@ from .export_latex import sympy2latex, sympy2latextable, sympy2multilatextable
25
  from .export_numpy import sympy2numpy
26
  from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
27
  from .export_torch import sympy2torch
 
28
  from .julia_helpers import (
29
  _escape_filename,
30
  _load_backend,
@@ -2385,36 +2386,3 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
2385
  f"{model_selection} is not a valid model selection strategy."
2386
  )
2387
  return chosen_idx
2388
-
2389
-
2390
- # Function has not been removed only due to usage in module tests
2391
- def _handle_feature_selection(X, select_k_features, y, variable_names):
2392
- if select_k_features is not None:
2393
- selection = run_feature_selection(X, y, select_k_features)
2394
- print(f"Using features {[variable_names[i] for i in selection]}")
2395
- X = X[:, selection]
2396
-
2397
- else:
2398
- selection = None
2399
- return X, selection
2400
-
2401
-
2402
- def run_feature_selection(X, y, select_k_features, random_state=None):
2403
- """
2404
- Find most important features.
2405
-
2406
- Uses a gradient boosting tree regressor as a proxy for finding
2407
- the k most important features in X, returning indices for those
2408
- features as output.
2409
- """
2410
- from sklearn.ensemble import RandomForestRegressor
2411
- from sklearn.feature_selection import SelectFromModel
2412
-
2413
- clf = RandomForestRegressor(
2414
- n_estimators=100, max_depth=3, random_state=random_state
2415
- )
2416
- clf.fit(X, y)
2417
- selector = SelectFromModel(
2418
- clf, threshold=-np.inf, max_features=select_k_features, prefit=True
2419
- )
2420
- return selector.get_support(indices=True)
 
25
  from .export_numpy import sympy2numpy
26
  from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
27
  from .export_torch import sympy2torch
28
+ from .feature_selection import run_feature_selection
29
  from .julia_helpers import (
30
  _escape_filename,
31
  _load_backend,
 
2386
  f"{model_selection} is not a valid model selection strategy."
2387
  )
2388
  return chosen_idx
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pysr/test/test.py CHANGED
@@ -14,14 +14,9 @@ from sklearn.utils.estimator_checks import check_estimator
14
 
15
  from .. import PySRRegressor, julia_helpers
16
  from ..export_latex import sympy2latex
17
- from ..sr import (
18
- _check_assertions,
19
- _csv_filename_to_pkl_filename,
20
- _handle_feature_selection,
21
- _process_constraints,
22
- idx_model_selection,
23
- run_feature_selection,
24
- )
25
 
26
  DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
27
  DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
 
14
 
15
  from .. import PySRRegressor, julia_helpers
16
  from ..export_latex import sympy2latex
17
+ from ..feature_selection import _handle_feature_selection, run_feature_selection
18
+ from ..sr import _check_assertions, _process_constraints, idx_model_selection
19
+ from ..utils import _csv_filename_to_pkl_filename
 
 
 
 
 
20
 
21
  DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
22
  DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default