tttc3 committed on
Commit
3ef5500
·
1 Parent(s): a62a370

Added control of random_state for numpy and julia

Browse files
Files changed (1) hide show
  1. pysr/sr.py +44 -18
pysr/sr.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import sys
3
  import numpy as np
4
  import pandas as pd
 
5
  import sympy
6
  from sympy import sympify
7
  import re
@@ -172,6 +173,10 @@ def best_callable(*args, **kwargs): # pragma: no cover
172
  )
173
 
174
 
 
 
 
 
175
  class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
176
  """
177
  High-performance symbolic regression.
@@ -422,6 +427,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
422
  What precision to use for the data. By default this is 32
423
  (float32), but you can select 64 or 16 as well.
424
 
 
 
 
 
425
  verbosity : int, default=1e9
426
  What verbosity level to use. 0 means minimal print statements.
427
 
@@ -566,9 +575,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
566
  array([-1.15907818, -1.15907818, -1.15907818, -1.15907818, -1.15907818])
567
  """
568
 
569
- # Class validation constants
570
- VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"]
571
-
572
  def __init__(
573
  self,
574
  model_selection="best",
@@ -626,6 +632,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
626
  batch_size=50,
627
  fast_cycle=False,
628
  precision=32,
 
629
  verbosity=1e9,
630
  update_verbosity=None,
631
  progress=True,
@@ -709,6 +716,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
709
  self.batch_size = batch_size
710
  self.fast_cycle = fast_cycle
711
  self.precision = precision
 
712
  # Additional runtime parameters
713
  # - Runtime user interface
714
  self.verbosity = verbosity
@@ -940,9 +948,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
940
  )
941
 
942
  # NotImplementedError - Values that could be supported at a later time
943
- if self.optimizer_algorithm not in self.VALID_OPTIMIZER_ALGORITHMS:
944
  raise NotImplementedError(
945
- f"PySR currently only supports the following optimizer algorithms: {self.VALID_OPTIMIZER_ALGORITHMS}"
946
  )
947
 
948
  if isinstance(X, pd.DataFrame):
@@ -988,7 +996,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
988
 
989
  return X, y, Xresampled, variable_names
990
 
991
- def _pre_transform_training_data(self, X, y, Xresampled, variable_names):
 
 
992
  """
993
  Transforms the training data before fitting the symbolic regressor.
994
 
@@ -1009,6 +1019,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1009
  variable_names : list[str] of length n_features
1010
  Names of each variable in the training dataset, `X`.
1011
 
 
 
 
 
1012
  Returns
1013
  -------
1014
  X_transformed : ndarray of shape (n_samples, n_features)
@@ -1031,7 +1045,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1031
  """
1032
  # Feature selection transformation
1033
  if self.select_k_features:
1034
- self.selection_mask_ = run_feature_selection(X, y, self.select_k_features)
 
 
1035
  X = X[:, self.selection_mask_]
1036
 
1037
  if Xresampled is not None:
@@ -1051,7 +1067,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1051
  if self.nout_ > 1:
1052
  y = np.stack(
1053
  [
1054
- _denoise(X, y[:, i], Xresampled=Xresampled)[1]
 
 
1055
  for i in range(self.nout_)
1056
  ],
1057
  axis=1,
@@ -1059,11 +1077,11 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1059
  if Xresampled is not None:
1060
  X = Xresampled
1061
  else:
1062
- X, y = _denoise(X, y, Xresampled=Xresampled)
1063
 
1064
  return X, y, variable_names
1065
 
1066
- def _run(self, X, y, weights):
1067
  """
1068
  Run the symbolic regression fitting process on the julia backend.
1069
 
@@ -1245,7 +1263,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1245
  ]
1246
 
1247
  # Call to Julia backend.
1248
- # See https://github.com/search?q=%22function+Options%22+repo%3AMilesCranmer%2FSymbolicRegression.jl+path%3A%2Fsrc%2F+filename%3AOptions.jl+language%3AJulia&type=Code
1249
  options = Main.Options(
1250
  binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")),
1251
  unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")),
@@ -1294,6 +1312,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1294
  skip_mutation_failures=self.skip_mutation_failures,
1295
  max_evals=self.max_evals,
1296
  earlyStopCondition=self.early_stop_condition,
 
1297
  )
1298
 
1299
  # Convert data to desired precision
@@ -1316,7 +1335,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1316
  cprocs = 0 if multithreading else self.procs
1317
 
1318
  # Call to Julia backend.
1319
- # See https://github.com/search?q=%22function+EquationSearch%22+repo%3AMilesCranmer%2FSymbolicRegression.jl+path%3A%2Fsrc%2F+filename%3ASymbolicRegression.jl+language%3AJulia&type=Code
1320
  self.raw_julia_state_ = Main.EquationSearch(
1321
  Main.X,
1322
  Main.y,
@@ -1390,6 +1409,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1390
  self.selection_mask_ = None
1391
  self.raw_julia_state_ = None
1392
 
 
 
 
1393
  self._setup_equation_file()
1394
 
1395
  # Parameter input validation (for parameters defined in __init__)
@@ -1410,7 +1432,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1410
 
1411
  # Pre transformations (feature selection and denoising)
1412
  X, y, variable_names = self._pre_transform_training_data(
1413
- X, y, Xresampled, variable_names
1414
  )
1415
 
1416
  # Warn about large feature counts (still warn if feature count is large
@@ -1443,7 +1465,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1443
 
1444
  # Fitting procedure
1445
  if not from_equation_file:
1446
- self._run(X=X, y=y, weights=weights)
1447
  else:
1448
  self.equations_ = self.get_hof()
1449
 
@@ -1790,13 +1812,15 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1790
  return ret_outputs[0]
1791
 
1792
 
1793
- def _denoise(X, y, Xresampled=None):
1794
  """Denoise the dataset using a Gaussian process"""
1795
  from sklearn.gaussian_process import GaussianProcessRegressor
1796
  from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
1797
 
1798
  gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
1799
- gpr = GaussianProcessRegressor(kernel=gp_kernel, n_restarts_optimizer=50)
 
 
1800
  gpr.fit(X, y)
1801
  if Xresampled is not None:
1802
  return Xresampled, gpr.predict(Xresampled)
@@ -1816,7 +1840,7 @@ def _handle_feature_selection(X, select_k_features, y, variable_names):
1816
  return X, selection
1817
 
1818
 
1819
- def run_feature_selection(X, y, select_k_features):
1820
  """
1821
  Use a gradient boosting tree regressor as a proxy for finding
1822
  the k most important features in X, returning indices for those
@@ -1825,7 +1849,9 @@ def run_feature_selection(X, y, select_k_features):
1825
  from sklearn.ensemble import RandomForestRegressor
1826
  from sklearn.feature_selection import SelectFromModel
1827
 
1828
- clf = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=0)
 
 
1829
  clf.fit(X, y)
1830
  selector = SelectFromModel(
1831
  clf, threshold=-np.inf, max_features=select_k_features, prefit=True
 
2
  import sys
3
  import numpy as np
4
  import pandas as pd
5
+ from sklearn.utils import check_array, check_random_state
6
  import sympy
7
  from sympy import sympify
8
  import re
 
173
  )
174
 
175
 
176
+ # Class validation constants
177
+ VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"]
178
+
179
+
180
  class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
181
  """
182
  High-performance symbolic regression.
 
427
  What precision to use for the data. By default this is 32
428
  (float32), but you can select 64 or 16 as well.
429
 
430
+ random_state : int, Numpy RandomState instance or None, default=None
431
+ Pass an int for reproducible results across multiple function calls.
432
+ See :term:`Glossary <random_state>`.
433
+
434
  verbosity : int, default=1e9
435
  What verbosity level to use. 0 means minimal print statements.
436
 
 
575
  array([-1.15907818, -1.15907818, -1.15907818, -1.15907818, -1.15907818])
576
  """
577
 
 
 
 
578
  def __init__(
579
  self,
580
  model_selection="best",
 
632
  batch_size=50,
633
  fast_cycle=False,
634
  precision=32,
635
+ random_state=None,
636
  verbosity=1e9,
637
  update_verbosity=None,
638
  progress=True,
 
716
  self.batch_size = batch_size
717
  self.fast_cycle = fast_cycle
718
  self.precision = precision
719
+ self.random_state = random_state
720
  # Additional runtime parameters
721
  # - Runtime user interface
722
  self.verbosity = verbosity
 
948
  )
949
 
950
  # NotImplementedError - Values that could be supported at a later time
951
+ if self.optimizer_algorithm not in VALID_OPTIMIZER_ALGORITHMS:
952
  raise NotImplementedError(
953
+ f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
954
  )
955
 
956
  if isinstance(X, pd.DataFrame):
 
996
 
997
  return X, y, Xresampled, variable_names
998
 
999
+ def _pre_transform_training_data(
1000
+ self, X, y, Xresampled, variable_names, random_state
1001
+ ):
1002
  """
1003
  Transforms the training data before fitting the symbolic regressor.
1004
 
 
1019
  variable_names : list[str] of length n_features
1020
  Names of each variable in the training dataset, `X`.
1021
 
1022
+ random_state : int, Numpy RandomState instance or None, default=None
1023
+ Pass an int for reproducible results across multiple function calls.
1024
+ See :term:`Glossary <random_state>`.
1025
+
1026
  Returns
1027
  -------
1028
  X_transformed : ndarray of shape (n_samples, n_features)
 
1045
  """
1046
  # Feature selection transformation
1047
  if self.select_k_features:
1048
+ self.selection_mask_ = run_feature_selection(
1049
+ X, y, self.select_k_features, random_state=random_state
1050
+ )
1051
  X = X[:, self.selection_mask_]
1052
 
1053
  if Xresampled is not None:
 
1067
  if self.nout_ > 1:
1068
  y = np.stack(
1069
  [
1070
+ _denoise(
1071
+ X, y[:, i], Xresampled=Xresampled, random_state=random_state
1072
+ )[1]
1073
  for i in range(self.nout_)
1074
  ],
1075
  axis=1,
 
1077
  if Xresampled is not None:
1078
  X = Xresampled
1079
  else:
1080
+ X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
1081
 
1082
  return X, y, variable_names
1083
 
1084
+ def _run(self, X, y, weights, seed):
1085
  """
1086
  Run the symbolic regression fitting process on the julia backend.
1087
 
 
1263
  ]
1264
 
1265
  # Call to Julia backend.
1266
+ # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
1267
  options = Main.Options(
1268
  binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")),
1269
  unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")),
 
1312
  skip_mutation_failures=self.skip_mutation_failures,
1313
  max_evals=self.max_evals,
1314
  earlyStopCondition=self.early_stop_condition,
1315
+ seed=seed,
1316
  )
1317
 
1318
  # Convert data to desired precision
 
1335
  cprocs = 0 if multithreading else self.procs
1336
 
1337
  # Call to Julia backend.
1338
+ # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl
1339
  self.raw_julia_state_ = Main.EquationSearch(
1340
  Main.X,
1341
  Main.y,
 
1409
  self.selection_mask_ = None
1410
  self.raw_julia_state_ = None
1411
 
1412
+ random_state = check_random_state(self.random_state) # For np random
1413
+ seed = random_state.get_state()[1][0] # For julia random
1414
+
1415
  self._setup_equation_file()
1416
 
1417
  # Parameter input validation (for parameters defined in __init__)
 
1432
 
1433
  # Pre transformations (feature selection and denoising)
1434
  X, y, variable_names = self._pre_transform_training_data(
1435
+ X, y, Xresampled, variable_names, random_state
1436
  )
1437
 
1438
  # Warn about large feature counts (still warn if feature count is large
 
1465
 
1466
  # Fitting procedure
1467
  if not from_equation_file:
1468
+ self._run(X=X, y=y, weights=weights, seed=seed)
1469
  else:
1470
  self.equations_ = self.get_hof()
1471
 
 
1812
  return ret_outputs[0]
1813
 
1814
 
1815
+ def _denoise(X, y, Xresampled=None, random_state=None):
1816
  """Denoise the dataset using a Gaussian process"""
1817
  from sklearn.gaussian_process import GaussianProcessRegressor
1818
  from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
1819
 
1820
  gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
1821
+ gpr = GaussianProcessRegressor(
1822
+ kernel=gp_kernel, n_restarts_optimizer=50, random_state=random_state
1823
+ )
1824
  gpr.fit(X, y)
1825
  if Xresampled is not None:
1826
  return Xresampled, gpr.predict(Xresampled)
 
1840
  return X, selection
1841
 
1842
 
1843
+ def run_feature_selection(X, y, select_k_features, random_state=None):
1844
  """
1845
  Use a gradient boosting tree regressor as a proxy for finding
1846
  the k most important features in X, returning indices for those
 
1849
  from sklearn.ensemble import RandomForestRegressor
1850
  from sklearn.feature_selection import SelectFromModel
1851
 
1852
+ clf = RandomForestRegressor(
1853
+ n_estimators=100, max_depth=3, random_state=random_state
1854
+ )
1855
  clf.fit(X, y)
1856
  selector = SelectFromModel(
1857
  clf, threshold=-np.inf, max_features=select_k_features, prefit=True