Spaces:
Running
Running
MilesCranmer
committed on
refactor: more type declarations
Browse files - pysr/sr.py +51 -22
pysr/sr.py
CHANGED
@@ -21,9 +21,12 @@ else:
|
|
21 |
|
22 |
import numpy as np
|
23 |
import pandas as pd
|
|
|
|
|
24 |
from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
|
25 |
from sklearn.utils import check_array, check_consistent_length, check_random_state
|
26 |
-
from sklearn.utils.validation import _check_feature_names_in
|
|
|
27 |
|
28 |
from .denoising import denoise, multi_denoise
|
29 |
from .deprecated import DEPRECATED_KWARGS
|
@@ -179,6 +182,21 @@ VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
|
|
179 |
|
180 |
|
181 |
class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
"""
|
183 |
High-performance symbolic regression algorithm.
|
184 |
|
@@ -603,22 +621,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
603 |
Units of each variable in the training dataset, `y`.
|
604 |
nout_ : int
|
605 |
Number of output dimensions.
|
606 |
-
selection_mask_ :
|
607 |
-
|
608 |
-
`select_k_features` is set.
|
609 |
tempdir_ : Path
|
610 |
Path to the temporary equations directory.
|
611 |
-
equation_file_ : str
|
612 |
Output equation file name produced by the julia backend.
|
613 |
julia_state_stream_ : ndarray
|
614 |
The serialized state for the julia SymbolicRegression.jl backend (after fitting),
|
615 |
stored as an array of uint8, produced by Julia's Serialization.serialize function.
|
616 |
-
julia_state_
|
617 |
-
The deserialized state.
|
618 |
julia_options_stream_ : ndarray
|
619 |
The serialized julia options, stored as an array of uint8,
|
620 |
-
julia_options_
|
621 |
-
The deserialized julia options.
|
622 |
equation_file_contents_ : list[pandas.DataFrame]
|
623 |
Contents of the equation file output by the Julia backend.
|
624 |
show_pickle_warnings_ : bool
|
@@ -926,7 +939,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
926 |
Names of the features passed to the model.
|
927 |
Not needed if loading from a pickle file.
|
928 |
selection_mask : list[bool]
|
929 |
-
If using select_k_features
|
930 |
Not needed if loading from a pickle file.
|
931 |
nout : int
|
932 |
Number of outputs of the model.
|
@@ -1124,10 +1137,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1124 |
|
1125 |
@property
|
1126 |
def julia_options_(self):
|
|
|
1127 |
return jl_deserialize(self.julia_options_stream_)
|
1128 |
|
1129 |
@property
|
1130 |
def julia_state_(self):
|
|
|
1131 |
return jl_deserialize(self.julia_state_stream_)
|
1132 |
|
1133 |
@property
|
@@ -1140,7 +1155,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1140 |
)
|
1141 |
return self.julia_state_
|
1142 |
|
1143 |
-
def get_best(self, index=None):
|
1144 |
"""
|
1145 |
Get best equation using `model_selection`.
|
1146 |
|
@@ -1316,7 +1331,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1316 |
|
1317 |
def _validate_and_set_fit_params(
|
1318 |
self, X, y, Xresampled, weights, variable_names, X_units, y_units
|
1319 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1320 |
"""
|
1321 |
Validate the parameters passed to the :term`fit` method.
|
1322 |
|
@@ -1336,7 +1359,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1336 |
Weight array of the same shape as `y`.
|
1337 |
Each element is how to weight the mean-square-error loss
|
1338 |
for that particular element of y.
|
1339 |
-
variable_names :
|
1340 |
Names of each variable in the training dataset, `X`.
|
1341 |
X_units : list[str] of length n_features
|
1342 |
Units of each variable in the training dataset, `X`.
|
@@ -1392,7 +1415,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1392 |
if weights is not None:
|
1393 |
weights = check_array(weights, ensure_2d=False)
|
1394 |
check_consistent_length(weights, y)
|
1395 |
-
X, y = self.
|
1396 |
self.feature_names_in_ = _safe_check_feature_names_in(
|
1397 |
self, variable_names, generate_names=False
|
1398 |
)
|
@@ -1402,10 +1425,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1402 |
self.display_feature_names_in_ = np.array(
|
1403 |
[f"x{_subscriptify(i)}" for i in range(X.shape[1])]
|
1404 |
)
|
|
|
1405 |
else:
|
1406 |
self.display_feature_names_in_ = self.feature_names_in_
|
1407 |
-
|
1408 |
-
variable_names = self.feature_names_in_
|
1409 |
|
1410 |
# Handle multioutput data
|
1411 |
if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
|
@@ -1420,6 +1443,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1420 |
|
1421 |
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
1422 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1423 |
def _pre_transform_training_data(
|
1424 |
self, X, y, Xresampled, variable_names, X_units, y_units, random_state
|
1425 |
):
|
@@ -1489,7 +1518,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1489 |
self.X_units_ = copy.deepcopy(X_units)
|
1490 |
|
1491 |
# Re-perform data validation and feature name updating
|
1492 |
-
X, y = self.
|
1493 |
# Update feature names with selected variable names
|
1494 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
1495 |
self.display_feature_names_in_ = self.feature_names_in_
|
@@ -1506,7 +1535,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1506 |
|
1507 |
return X, y, variable_names, X_units, y_units
|
1508 |
|
1509 |
-
def _run(self, X, y, mutated_params, weights, seed):
|
1510 |
"""
|
1511 |
Run the symbolic regression fitting process on the julia backend.
|
1512 |
|
@@ -1784,9 +1813,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1784 |
y,
|
1785 |
Xresampled=None,
|
1786 |
weights=None,
|
1787 |
-
variable_names: Optional[
|
1788 |
-
X_units: Optional[
|
1789 |
-
y_units: Optional[
|
1790 |
) -> "PySRRegressor":
|
1791 |
"""
|
1792 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
@@ -2003,7 +2032,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2003 |
# reordered/reindexed to match those of the transformed (denoised and
|
2004 |
# feature selected) X in fit.
|
2005 |
X = X.reindex(columns=self.feature_names_in_)
|
2006 |
-
X = self.
|
2007 |
|
2008 |
try:
|
2009 |
if isinstance(best_equation, list):
|
|
|
21 |
|
22 |
import numpy as np
|
23 |
import pandas as pd
|
24 |
+
from numpy import ndarray
|
25 |
+
from numpy.typing import NDArray
|
26 |
from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
|
27 |
from sklearn.utils import check_array, check_consistent_length, check_random_state
|
28 |
+
from sklearn.utils.validation import _check_feature_names_in # type: ignore
|
29 |
+
from sklearn.utils.validation import check_is_fitted
|
30 |
|
31 |
from .denoising import denoise, multi_denoise
|
32 |
from .deprecated import DEPRECATED_KWARGS
|
|
|
182 |
|
183 |
|
184 |
class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
185 |
+
equations_: Optional[Union[pd.DataFrame, List[pd.DataFrame]]]
|
186 |
+
n_features_in_: int
|
187 |
+
feature_names_in_: ArrayLike[str]
|
188 |
+
display_feature_names_in_: ArrayLike[str]
|
189 |
+
X_units_: Optional[ArrayLike[str]]
|
190 |
+
y_units_: Optional[Union[str, ArrayLike[str]]]
|
191 |
+
nout_: int
|
192 |
+
selection_mask_: Optional[NDArray[np.bool_]]
|
193 |
+
tempdir_: Path
|
194 |
+
equation_file_: Union[str, Path]
|
195 |
+
julia_state_stream_: Optional[NDArray[np.uint8]]
|
196 |
+
julia_options_stream_: Optional[NDArray[np.uint8]]
|
197 |
+
equation_file_contents_: Optional[List[pd.DataFrame]]
|
198 |
+
show_pickle_warnings_: bool
|
199 |
+
|
200 |
"""
|
201 |
High-performance symbolic regression algorithm.
|
202 |
|
|
|
621 |
Units of each variable in the training dataset, `y`.
|
622 |
nout_ : int
|
623 |
Number of output dimensions.
|
624 |
+
selection_mask_ : ndarray of shape (`n_features_in_`,)
|
625 |
+
Mask of which features of `X` to use when `select_k_features` is set.
|
|
|
626 |
tempdir_ : Path
|
627 |
Path to the temporary equations directory.
|
628 |
+
equation_file_ : Union[str, Path]
|
629 |
Output equation file name produced by the julia backend.
|
630 |
julia_state_stream_ : ndarray
|
631 |
The serialized state for the julia SymbolicRegression.jl backend (after fitting),
|
632 |
stored as an array of uint8, produced by Julia's Serialization.serialize function.
|
|
|
|
|
633 |
julia_options_stream_ : ndarray
|
634 |
The serialized julia options, stored as an array of uint8,
|
|
|
|
|
635 |
equation_file_contents_ : list[pandas.DataFrame]
|
636 |
Contents of the equation file output by the Julia backend.
|
637 |
show_pickle_warnings_ : bool
|
|
|
939 |
Names of the features passed to the model.
|
940 |
Not needed if loading from a pickle file.
|
941 |
selection_mask : list[bool]
|
942 |
+
If using `select_k_features`, you must pass `model.selection_mask_` here.
|
943 |
Not needed if loading from a pickle file.
|
944 |
nout : int
|
945 |
Number of outputs of the model.
|
|
|
1137 |
|
1138 |
@property
|
1139 |
def julia_options_(self):
|
1140 |
+
"""The deserialized julia options."""
|
1141 |
return jl_deserialize(self.julia_options_stream_)
|
1142 |
|
1143 |
@property
|
1144 |
def julia_state_(self):
|
1145 |
+
"""The deserialized state."""
|
1146 |
return jl_deserialize(self.julia_state_stream_)
|
1147 |
|
1148 |
@property
|
|
|
1155 |
)
|
1156 |
return self.julia_state_
|
1157 |
|
1158 |
+
def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
|
1159 |
"""
|
1160 |
Get best equation using `model_selection`.
|
1161 |
|
|
|
1331 |
|
1332 |
def _validate_and_set_fit_params(
|
1333 |
self, X, y, Xresampled, weights, variable_names, X_units, y_units
|
1334 |
+
) -> Tuple[
|
1335 |
+
ndarray,
|
1336 |
+
ndarray,
|
1337 |
+
Optional[ndarray],
|
1338 |
+
Optional[ndarray],
|
1339 |
+
ndarray,
|
1340 |
+
Optional[ArrayLike[str]],
|
1341 |
+
Optional[Union[str, ArrayLike[str]]],
|
1342 |
+
]:
|
1343 |
"""
|
1344 |
Validate the parameters passed to the :term`fit` method.
|
1345 |
|
|
|
1359 |
Weight array of the same shape as `y`.
|
1360 |
Each element is how to weight the mean-square-error loss
|
1361 |
for that particular element of y.
|
1362 |
+
variable_names : ndarray of length n_features
|
1363 |
Names of each variable in the training dataset, `X`.
|
1364 |
X_units : list[str] of length n_features
|
1365 |
Units of each variable in the training dataset, `X`.
|
|
|
1415 |
if weights is not None:
|
1416 |
weights = check_array(weights, ensure_2d=False)
|
1417 |
check_consistent_length(weights, y)
|
1418 |
+
X, y = self._validate_data_X_y(X, y)
|
1419 |
self.feature_names_in_ = _safe_check_feature_names_in(
|
1420 |
self, variable_names, generate_names=False
|
1421 |
)
|
|
|
1425 |
self.display_feature_names_in_ = np.array(
|
1426 |
[f"x{_subscriptify(i)}" for i in range(X.shape[1])]
|
1427 |
)
|
1428 |
+
variable_names = self.feature_names_in_
|
1429 |
else:
|
1430 |
self.display_feature_names_in_ = self.feature_names_in_
|
1431 |
+
variable_names = self.feature_names_in_
|
|
|
1432 |
|
1433 |
# Handle multioutput data
|
1434 |
if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
|
|
|
1443 |
|
1444 |
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
1445 |
|
1446 |
+
def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
|
1447 |
+
return self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore
|
1448 |
+
|
1449 |
+
def _validate_data_X(self, X) -> Tuple[ndarray]:
|
1450 |
+
return self._validate_data(X=X, reset=False) # type: ignore
|
1451 |
+
|
1452 |
def _pre_transform_training_data(
|
1453 |
self, X, y, Xresampled, variable_names, X_units, y_units, random_state
|
1454 |
):
|
|
|
1518 |
self.X_units_ = copy.deepcopy(X_units)
|
1519 |
|
1520 |
# Re-perform data validation and feature name updating
|
1521 |
+
X, y = self._validate_data_X_y(X, y)
|
1522 |
# Update feature names with selected variable names
|
1523 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
1524 |
self.display_feature_names_in_ = self.feature_names_in_
|
|
|
1535 |
|
1536 |
return X, y, variable_names, X_units, y_units
|
1537 |
|
1538 |
+
def _run(self, X, y, mutated_params, weights, seed: int):
|
1539 |
"""
|
1540 |
Run the symbolic regression fitting process on the julia backend.
|
1541 |
|
|
|
1813 |
y,
|
1814 |
Xresampled=None,
|
1815 |
weights=None,
|
1816 |
+
variable_names: Optional[ArrayLike[str]] = None,
|
1817 |
+
X_units: Optional[ArrayLike[str]] = None,
|
1818 |
+
y_units: Optional[Union[str, ArrayLike[str]]] = None,
|
1819 |
) -> "PySRRegressor":
|
1820 |
"""
|
1821 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
|
|
2032 |
# reordered/reindexed to match those of the transformed (denoised and
|
2033 |
# feature selected) X in fit.
|
2034 |
X = X.reindex(columns=self.feature_names_in_)
|
2035 |
+
X = self._validate_data_X(X)
|
2036 |
|
2037 |
try:
|
2038 |
if isinstance(best_equation, list):
|