Spaces:
Running
Running
tttc3
committed on
Commit
·
3ef5500
1
Parent(s):
a62a370
Added control of random_state for numpy and julia
Browse files
- pysr/sr.py +44 -18
pysr/sr.py
CHANGED
@@ -2,6 +2,7 @@ import os
|
|
2 |
import sys
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
|
|
5 |
import sympy
|
6 |
from sympy import sympify
|
7 |
import re
|
@@ -172,6 +173,10 @@ def best_callable(*args, **kwargs): # pragma: no cover
|
|
172 |
)
|
173 |
|
174 |
|
|
|
|
|
|
|
|
|
175 |
class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
176 |
"""
|
177 |
High-performance symbolic regression.
|
@@ -422,6 +427,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
422 |
What precision to use for the data. By default this is 32
|
423 |
(float32), but you can select 64 or 16 as well.
|
424 |
|
|
|
|
|
|
|
|
|
425 |
verbosity : int, default=1e9
|
426 |
What verbosity level to use. 0 means minimal print statements.
|
427 |
|
@@ -566,9 +575,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
566 |
array([-1.15907818, -1.15907818, -1.15907818, -1.15907818, -1.15907818])
|
567 |
"""
|
568 |
|
569 |
-
# Class validation constants
|
570 |
-
VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"]
|
571 |
-
|
572 |
def __init__(
|
573 |
self,
|
574 |
model_selection="best",
|
@@ -626,6 +632,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
626 |
batch_size=50,
|
627 |
fast_cycle=False,
|
628 |
precision=32,
|
|
|
629 |
verbosity=1e9,
|
630 |
update_verbosity=None,
|
631 |
progress=True,
|
@@ -709,6 +716,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
709 |
self.batch_size = batch_size
|
710 |
self.fast_cycle = fast_cycle
|
711 |
self.precision = precision
|
|
|
712 |
# Additional runtime parameters
|
713 |
# - Runtime user interface
|
714 |
self.verbosity = verbosity
|
@@ -940,9 +948,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
940 |
)
|
941 |
|
942 |
# NotImplementedError - Values that could be supported at a later time
|
943 |
-
if self.optimizer_algorithm not in
|
944 |
raise NotImplementedError(
|
945 |
-
f"PySR currently only supports the following optimizer algorithms: {
|
946 |
)
|
947 |
|
948 |
if isinstance(X, pd.DataFrame):
|
@@ -988,7 +996,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
988 |
|
989 |
return X, y, Xresampled, variable_names
|
990 |
|
991 |
-
def _pre_transform_training_data(
|
|
|
|
|
992 |
"""
|
993 |
Transforms the training data before fitting the symbolic regressor.
|
994 |
|
@@ -1009,6 +1019,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
1009 |
variable_names : list[str] of length n_features
|
1010 |
Names of each variable in the training dataset, `X`.
|
1011 |
|
|
|
|
|
|
|
|
|
1012 |
Returns
|
1013 |
-------
|
1014 |
X_transformed : ndarray of shape (n_samples, n_features)
|
@@ -1031,7 +1045,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
1031 |
"""
|
1032 |
# Feature selection transformation
|
1033 |
if self.select_k_features:
|
1034 |
-
self.selection_mask_ = run_feature_selection(
|
|
|
|
|
1035 |
X = X[:, self.selection_mask_]
|
1036 |
|
1037 |
if Xresampled is not None:
|
@@ -1051,7 +1067,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
1051 |
if self.nout_ > 1:
|
1052 |
y = np.stack(
|
1053 |
[
|
1054 |
-
_denoise(
|
|
|
|
|
1055 |
for i in range(self.nout_)
|
1056 |
],
|
1057 |
axis=1,
|
@@ -1059,11 +1077,11 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
1059 |
if Xresampled is not None:
|
1060 |
X = Xresampled
|
1061 |
else:
|
1062 |
-
X, y = _denoise(X, y, Xresampled=Xresampled)
|
1063 |
|
1064 |
return X, y, variable_names
|
1065 |
|
1066 |
-
def _run(self, X, y, weights):
|
1067 |
"""
|
1068 |
Run the symbolic regression fitting process on the julia backend.
|
1069 |
|
@@ -1245,7 +1263,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
1245 |
]
|
1246 |
|
1247 |
# Call to Julia backend.
|
1248 |
-
# See https://github.com/
|
1249 |
options = Main.Options(
|
1250 |
binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")),
|
1251 |
unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")),
|
@@ -1294,6 +1312,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
1294 |
skip_mutation_failures=self.skip_mutation_failures,
|
1295 |
max_evals=self.max_evals,
|
1296 |
earlyStopCondition=self.early_stop_condition,
|
|
|
1297 |
)
|
1298 |
|
1299 |
# Convert data to desired precision
|
@@ -1316,7 +1335,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
1316 |
cprocs = 0 if multithreading else self.procs
|
1317 |
|
1318 |
# Call to Julia backend.
|
1319 |
-
# See https://github.com/
|
1320 |
self.raw_julia_state_ = Main.EquationSearch(
|
1321 |
Main.X,
|
1322 |
Main.y,
|
@@ -1390,6 +1409,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
1390 |
self.selection_mask_ = None
|
1391 |
self.raw_julia_state_ = None
|
1392 |
|
|
|
|
|
|
|
1393 |
self._setup_equation_file()
|
1394 |
|
1395 |
# Parameter input validation (for parameters defined in __init__)
|
@@ -1410,7 +1432,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
1410 |
|
1411 |
# Pre transformations (feature selection and denoising)
|
1412 |
X, y, variable_names = self._pre_transform_training_data(
|
1413 |
-
X, y, Xresampled, variable_names
|
1414 |
)
|
1415 |
|
1416 |
# Warn about large feature counts (still warn if feature count is large
|
@@ -1443,7 +1465,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
1443 |
|
1444 |
# Fitting procedure
|
1445 |
if not from_equation_file:
|
1446 |
-
self._run(X=X, y=y, weights=weights)
|
1447 |
else:
|
1448 |
self.equations_ = self.get_hof()
|
1449 |
|
@@ -1790,13 +1812,15 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
|
1790 |
return ret_outputs[0]
|
1791 |
|
1792 |
|
1793 |
-
def _denoise(X, y, Xresampled=None):
|
1794 |
"""Denoise the dataset using a Gaussian process"""
|
1795 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
1796 |
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
|
1797 |
|
1798 |
gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
|
1799 |
-
gpr = GaussianProcessRegressor(
|
|
|
|
|
1800 |
gpr.fit(X, y)
|
1801 |
if Xresampled is not None:
|
1802 |
return Xresampled, gpr.predict(Xresampled)
|
@@ -1816,7 +1840,7 @@ def _handle_feature_selection(X, select_k_features, y, variable_names):
|
|
1816 |
return X, selection
|
1817 |
|
1818 |
|
1819 |
-
def run_feature_selection(X, y, select_k_features):
|
1820 |
"""
|
1821 |
Use a gradient boosting tree regressor as a proxy for finding
|
1822 |
the k most important features in X, returning indices for those
|
@@ -1825,7 +1849,9 @@ def run_feature_selection(X, y, select_k_features):
|
|
1825 |
from sklearn.ensemble import RandomForestRegressor
|
1826 |
from sklearn.feature_selection import SelectFromModel
|
1827 |
|
1828 |
-
clf = RandomForestRegressor(
|
|
|
|
|
1829 |
clf.fit(X, y)
|
1830 |
selector = SelectFromModel(
|
1831 |
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|
|
|
2 |
import sys
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
5 |
+
from sklearn.utils import check_array, check_random_state
|
6 |
import sympy
|
7 |
from sympy import sympify
|
8 |
import re
|
|
|
173 |
)
|
174 |
|
175 |
|
176 |
+
# Class validation constants
|
177 |
+
VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"]
|
178 |
+
|
179 |
+
|
180 |
class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
|
181 |
"""
|
182 |
High-performance symbolic regression.
|
|
|
427 |
What precision to use for the data. By default this is 32
|
428 |
(float32), but you can select 64 or 16 as well.
|
429 |
|
430 |
+
random_state : int, Numpy RandomState instance or None, default=None
|
431 |
+
Pass an int for reproducible results across multiple function calls.
|
432 |
+
See :term:`Glossary <random_state>`.
|
433 |
+
|
434 |
verbosity : int, default=1e9
|
435 |
What verbosity level to use. 0 means minimal print statements.
|
436 |
|
|
|
575 |
array([-1.15907818, -1.15907818, -1.15907818, -1.15907818, -1.15907818])
|
576 |
"""
|
577 |
|
|
|
|
|
|
|
578 |
def __init__(
|
579 |
self,
|
580 |
model_selection="best",
|
|
|
632 |
batch_size=50,
|
633 |
fast_cycle=False,
|
634 |
precision=32,
|
635 |
+
random_state=None,
|
636 |
verbosity=1e9,
|
637 |
update_verbosity=None,
|
638 |
progress=True,
|
|
|
716 |
self.batch_size = batch_size
|
717 |
self.fast_cycle = fast_cycle
|
718 |
self.precision = precision
|
719 |
+
self.random_state = random_state
|
720 |
# Additional runtime parameters
|
721 |
# - Runtime user interface
|
722 |
self.verbosity = verbosity
|
|
|
948 |
)
|
949 |
|
950 |
# NotImplementedError - Values that could be supported at a later time
|
951 |
+
if self.optimizer_algorithm not in VALID_OPTIMIZER_ALGORITHMS:
|
952 |
raise NotImplementedError(
|
953 |
+
f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
|
954 |
)
|
955 |
|
956 |
if isinstance(X, pd.DataFrame):
|
|
|
996 |
|
997 |
return X, y, Xresampled, variable_names
|
998 |
|
999 |
+
def _pre_transform_training_data(
|
1000 |
+
self, X, y, Xresampled, variable_names, random_state
|
1001 |
+
):
|
1002 |
"""
|
1003 |
Transforms the training data before fitting the symbolic regressor.
|
1004 |
|
|
|
1019 |
variable_names : list[str] of length n_features
|
1020 |
Names of each variable in the training dataset, `X`.
|
1021 |
|
1022 |
+
random_state : int, Numpy RandomState instance or None, default=None
|
1023 |
+
Pass an int for reproducible results across multiple function calls.
|
1024 |
+
See :term:`Glossary <random_state>`.
|
1025 |
+
|
1026 |
Returns
|
1027 |
-------
|
1028 |
X_transformed : ndarray of shape (n_samples, n_features)
|
|
|
1045 |
"""
|
1046 |
# Feature selection transformation
|
1047 |
if self.select_k_features:
|
1048 |
+
self.selection_mask_ = run_feature_selection(
|
1049 |
+
X, y, self.select_k_features, random_state=random_state
|
1050 |
+
)
|
1051 |
X = X[:, self.selection_mask_]
|
1052 |
|
1053 |
if Xresampled is not None:
|
|
|
1067 |
if self.nout_ > 1:
|
1068 |
y = np.stack(
|
1069 |
[
|
1070 |
+
_denoise(
|
1071 |
+
X, y[:, i], Xresampled=Xresampled, random_state=random_state
|
1072 |
+
)[1]
|
1073 |
for i in range(self.nout_)
|
1074 |
],
|
1075 |
axis=1,
|
|
|
1077 |
if Xresampled is not None:
|
1078 |
X = Xresampled
|
1079 |
else:
|
1080 |
+
X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
|
1081 |
|
1082 |
return X, y, variable_names
|
1083 |
|
1084 |
+
def _run(self, X, y, weights, seed):
|
1085 |
"""
|
1086 |
Run the symbolic regression fitting process on the julia backend.
|
1087 |
|
|
|
1263 |
]
|
1264 |
|
1265 |
# Call to Julia backend.
|
1266 |
+
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
|
1267 |
options = Main.Options(
|
1268 |
binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")),
|
1269 |
unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")),
|
|
|
1312 |
skip_mutation_failures=self.skip_mutation_failures,
|
1313 |
max_evals=self.max_evals,
|
1314 |
earlyStopCondition=self.early_stop_condition,
|
1315 |
+
seed=seed,
|
1316 |
)
|
1317 |
|
1318 |
# Convert data to desired precision
|
|
|
1335 |
cprocs = 0 if multithreading else self.procs
|
1336 |
|
1337 |
# Call to Julia backend.
|
1338 |
+
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl
|
1339 |
self.raw_julia_state_ = Main.EquationSearch(
|
1340 |
Main.X,
|
1341 |
Main.y,
|
|
|
1409 |
self.selection_mask_ = None
|
1410 |
self.raw_julia_state_ = None
|
1411 |
|
1412 |
+
random_state = check_random_state(self.random_state) # For np random
|
1413 |
+
seed = random_state.get_state()[1][0] # For julia random
|
1414 |
+
|
1415 |
self._setup_equation_file()
|
1416 |
|
1417 |
# Parameter input validation (for parameters defined in __init__)
|
|
|
1432 |
|
1433 |
# Pre transformations (feature selection and denoising)
|
1434 |
X, y, variable_names = self._pre_transform_training_data(
|
1435 |
+
X, y, Xresampled, variable_names, random_state
|
1436 |
)
|
1437 |
|
1438 |
# Warn about large feature counts (still warn if feature count is large
|
|
|
1465 |
|
1466 |
# Fitting procedure
|
1467 |
if not from_equation_file:
|
1468 |
+
self._run(X=X, y=y, weights=weights, seed=seed)
|
1469 |
else:
|
1470 |
self.equations_ = self.get_hof()
|
1471 |
|
|
|
1812 |
return ret_outputs[0]
|
1813 |
|
1814 |
|
1815 |
+
def _denoise(X, y, Xresampled=None, random_state=None):
|
1816 |
"""Denoise the dataset using a Gaussian process"""
|
1817 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
1818 |
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
|
1819 |
|
1820 |
gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
|
1821 |
+
gpr = GaussianProcessRegressor(
|
1822 |
+
kernel=gp_kernel, n_restarts_optimizer=50, random_state=random_state
|
1823 |
+
)
|
1824 |
gpr.fit(X, y)
|
1825 |
if Xresampled is not None:
|
1826 |
return Xresampled, gpr.predict(Xresampled)
|
|
|
1840 |
return X, selection
|
1841 |
|
1842 |
|
1843 |
+
def run_feature_selection(X, y, select_k_features, random_state=None):
|
1844 |
"""
|
1845 |
Use a gradient boosting tree regressor as a proxy for finding
|
1846 |
the k most important features in X, returning indices for those
|
|
|
1849 |
from sklearn.ensemble import RandomForestRegressor
|
1850 |
from sklearn.feature_selection import SelectFromModel
|
1851 |
|
1852 |
+
clf = RandomForestRegressor(
|
1853 |
+
n_estimators=100, max_depth=3, random_state=random_state
|
1854 |
+
)
|
1855 |
clf.fit(X, y)
|
1856 |
selector = SelectFromModel(
|
1857 |
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|