Spaces:
Sleeping
Sleeping
MilesCranmer
committed on
Commit
•
42005bd
1
Parent(s):
13d5805
Enable dimensional constraints
Browse files- pysr/sr.py +85 -8
pysr/sr.py
CHANGED
@@ -167,6 +167,8 @@ def _check_assertions(
|
|
167 |
variable_names,
|
168 |
weights,
|
169 |
y,
|
|
|
|
|
170 |
):
|
171 |
# Check for potential errors before they happen
|
172 |
assert len(X.shape) == 2
|
@@ -190,6 +192,24 @@ def _check_assertions(
|
|
190 |
"Only alphanumeric characters, numbers, "
|
191 |
"and underscores are allowed."
|
192 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
|
194 |
|
195 |
def best(*args, **kwargs): # pragma: no cover
|
@@ -635,6 +655,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
635 |
has feature names that are all strings.
|
636 |
pretty_feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
637 |
Pretty names of features, used only during printing.
|
|
|
|
|
|
|
|
|
638 |
nout_ : int
|
639 |
Number of output dimensions.
|
640 |
selection_mask_ : list[int] of length `select_k_features`
|
@@ -1324,7 +1348,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1324 |
|
1325 |
return packed_modified_params
|
1326 |
|
1327 |
-
def _validate_and_set_fit_params(
|
|
|
|
|
1328 |
"""
|
1329 |
Validate the parameters passed to the :term`fit` method.
|
1330 |
|
@@ -1346,6 +1372,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1346 |
for that particular element of y.
|
1347 |
variable_names : list[str] of length n_features
|
1348 |
Names of each variable in the training dataset, `X`.
|
|
|
|
|
|
|
|
|
1349 |
|
1350 |
Returns
|
1351 |
-------
|
@@ -1357,6 +1387,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1357 |
Validated resampled training data used for denoising.
|
1358 |
variable_names_validated : list[str] of length n_features
|
1359 |
Validated list of variable names for each feature in `X`.
|
|
|
|
|
|
|
|
|
1360 |
|
1361 |
"""
|
1362 |
if isinstance(X, pd.DataFrame):
|
@@ -1415,10 +1449,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1415 |
else:
|
1416 |
raise NotImplementedError("y shape not supported!")
|
1417 |
|
1418 |
-
|
|
|
|
|
|
|
1419 |
|
1420 |
def _pre_transform_training_data(
|
1421 |
-
self, X, y, Xresampled, variable_names, random_state
|
1422 |
):
|
1423 |
"""
|
1424 |
Transform the training data before fitting the symbolic regressor.
|
@@ -1438,6 +1475,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1438 |
variable_names : list[str]
|
1439 |
Names of each variable in the training dataset, `X`.
|
1440 |
Of length `n_features`.
|
|
|
|
|
|
|
|
|
1441 |
random_state : int | np.RandomState
|
1442 |
Pass an int for reproducible results across multiple function calls.
|
1443 |
See :term:`Glossary <random_state>`. Default is `None`.
|
@@ -1459,6 +1500,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1459 |
variable_names_transformed : list[str] of length n_features
|
1460 |
Names of each variable in the transformed dataset,
|
1461 |
`X_transformed`.
|
|
|
|
|
|
|
|
|
1462 |
"""
|
1463 |
# Feature selection transformation
|
1464 |
if self.select_k_features:
|
@@ -1473,6 +1518,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1473 |
# Reduce variable_names to selection
|
1474 |
variable_names = [variable_names[i] for i in self.selection_mask_]
|
1475 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1476 |
# Re-perform data validation and feature name updating
|
1477 |
X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
|
1478 |
# Update feature names with selected variable names
|
@@ -1497,7 +1549,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1497 |
else:
|
1498 |
X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
|
1499 |
|
1500 |
-
return X, y, variable_names
|
1501 |
|
1502 |
def _run(self, X, y, mutated_params, weights, seed):
|
1503 |
"""
|
@@ -1733,6 +1785,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1733 |
and self.pretty_feature_names_in_ is not None
|
1734 |
else self.feature_names_in_.tolist()
|
1735 |
),
|
|
|
|
|
1736 |
options=options,
|
1737 |
numprocs=cprocs,
|
1738 |
parallelism=parallelism,
|
@@ -1758,6 +1812,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1758 |
Xresampled=None,
|
1759 |
weights=None,
|
1760 |
variable_names=None,
|
|
|
|
|
1761 |
):
|
1762 |
"""
|
1763 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
@@ -1785,6 +1841,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1785 |
instead of `variable_names`. Cannot contain spaces or special
|
1786 |
characters. Avoid variable names which are also
|
1787 |
function names in `sympy`, such as "N".
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1788 |
|
1789 |
Returns
|
1790 |
-------
|
@@ -1806,6 +1871,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1806 |
self.nout_ = 1
|
1807 |
self.selection_mask_ = None
|
1808 |
self.raw_julia_state_ = None
|
|
|
|
|
1809 |
|
1810 |
random_state = check_random_state(self.random_state) # For np random
|
1811 |
seed = random_state.get_state()[1][0] # For julia random
|
@@ -1814,8 +1881,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1814 |
|
1815 |
mutated_params = self._validate_and_set_init_params()
|
1816 |
|
1817 |
-
|
1818 |
-
X,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1819 |
)
|
1820 |
|
1821 |
if X.shape[0] > 10000 and not self.batching:
|
@@ -1830,8 +1905,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1830 |
)
|
1831 |
|
1832 |
# Pre transformations (feature selection and denoising)
|
1833 |
-
X, y, variable_names = self._pre_transform_training_data(
|
1834 |
-
X, y, Xresampled, variable_names, random_state
|
1835 |
)
|
1836 |
|
1837 |
# Warn about large feature counts (still warn if feature count is large
|
@@ -1860,6 +1935,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1860 |
variable_names,
|
1861 |
weights,
|
1862 |
y,
|
|
|
|
|
1863 |
)
|
1864 |
|
1865 |
# Initially, just save model parameters, so that
|
|
|
167 |
variable_names,
|
168 |
weights,
|
169 |
y,
|
170 |
+
X_units,
|
171 |
+
y_units,
|
172 |
):
|
173 |
# Check for potential errors before they happen
|
174 |
assert len(X.shape) == 2
|
|
|
192 |
"Only alphanumeric characters, numbers, "
|
193 |
"and underscores are allowed."
|
194 |
)
|
195 |
+
if X_units is not None and len(X_units) != X.shape[1]:
|
196 |
+
raise ValueError(
|
197 |
+
"The number of units in `X_units` must equal the number of features in `X`."
|
198 |
+
)
|
199 |
+
if y_units is not None:
|
200 |
+
good_y_units = False
|
201 |
+
if isinstance(y_units, list):
|
202 |
+
if len(y.shape) == 1:
|
203 |
+
good_y_units = len(y_units) == 1
|
204 |
+
else:
|
205 |
+
good_y_units = len(y_units) == y.shape[1]
|
206 |
+
else:
|
207 |
+
good_y_units = len(y.shape) == 1 or y.shape[1] == 1
|
208 |
+
|
209 |
+
if not good_y_units:
|
210 |
+
raise ValueError(
|
211 |
+
"The number of units in `y_units` must equal the number of output features in `y`."
|
212 |
+
)
|
213 |
|
214 |
|
215 |
def best(*args, **kwargs): # pragma: no cover
|
|
|
655 |
has feature names that are all strings.
|
656 |
pretty_feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
657 |
Pretty names of features, used only during printing.
|
658 |
+
X_units_ : list[str] of length n_features
|
659 |
+
Units of each variable in the training dataset, `X`.
|
660 |
+
y_units_ : str | list[str] of length n_out
|
661 |
+
Units of each variable in the training dataset, `y`.
|
662 |
nout_ : int
|
663 |
Number of output dimensions.
|
664 |
selection_mask_ : list[int] of length `select_k_features`
|
|
|
1348 |
|
1349 |
return packed_modified_params
|
1350 |
|
1351 |
+
def _validate_and_set_fit_params(
|
1352 |
+
self, X, y, Xresampled, weights, variable_names, X_units, y_units
|
1353 |
+
):
|
1354 |
"""
|
1355 |
Validate the parameters passed to the :term`fit` method.
|
1356 |
|
|
|
1372 |
for that particular element of y.
|
1373 |
variable_names : list[str] of length n_features
|
1374 |
Names of each variable in the training dataset, `X`.
|
1375 |
+
X_units : list[str] of length n_features
|
1376 |
+
Units of each variable in the training dataset, `X`.
|
1377 |
+
y_units : str | list[str] of length n_out
|
1378 |
+
Units of each variable in the training dataset, `y`.
|
1379 |
|
1380 |
Returns
|
1381 |
-------
|
|
|
1387 |
Validated resampled training data used for denoising.
|
1388 |
variable_names_validated : list[str] of length n_features
|
1389 |
Validated list of variable names for each feature in `X`.
|
1390 |
+
X_units : list[str] of length n_features
|
1391 |
+
Validated units for `X`.
|
1392 |
+
y_units : str | list[str] of length n_out
|
1393 |
+
Validated units for `y`.
|
1394 |
|
1395 |
"""
|
1396 |
if isinstance(X, pd.DataFrame):
|
|
|
1449 |
else:
|
1450 |
raise NotImplementedError("y shape not supported!")
|
1451 |
|
1452 |
+
self.X_units_ = copy.deepcopy(X_units)
|
1453 |
+
self.y_units_ = copy.deepcopy(y_units)
|
1454 |
+
|
1455 |
+
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
1456 |
|
1457 |
def _pre_transform_training_data(
|
1458 |
+
self, X, y, Xresampled, variable_names, X_units, y_units, random_state
|
1459 |
):
|
1460 |
"""
|
1461 |
Transform the training data before fitting the symbolic regressor.
|
|
|
1475 |
variable_names : list[str]
|
1476 |
Names of each variable in the training dataset, `X`.
|
1477 |
Of length `n_features`.
|
1478 |
+
X_units : list[str]
|
1479 |
+
Units of each variable in the training dataset, `X`.
|
1480 |
+
y_units : str | list[str]
|
1481 |
+
Units of each variable in the training dataset, `y`.
|
1482 |
random_state : int | np.RandomState
|
1483 |
Pass an int for reproducible results across multiple function calls.
|
1484 |
See :term:`Glossary <random_state>`. Default is `None`.
|
|
|
1500 |
variable_names_transformed : list[str] of length n_features
|
1501 |
Names of each variable in the transformed dataset,
|
1502 |
`X_transformed`.
|
1503 |
+
X_units_transformed : list[str] of length n_features
|
1504 |
+
Units of each variable in the transformed dataset.
|
1505 |
+
y_units_transformed : str | list[str] of length n_out
|
1506 |
+
Units of each variable in the transformed dataset.
|
1507 |
"""
|
1508 |
# Feature selection transformation
|
1509 |
if self.select_k_features:
|
|
|
1518 |
# Reduce variable_names to selection
|
1519 |
variable_names = [variable_names[i] for i in self.selection_mask_]
|
1520 |
|
1521 |
+
if X_units is not None:
|
1522 |
+
X_units = [X_units[i] for i in self.selection_mask_]
|
1523 |
+
self.X_units_ = copy.deepcopy(X_units)
|
1524 |
+
if y_units is not None:
|
1525 |
+
y_units = [y_units[i] for i in self.selection_mask_]
|
1526 |
+
self.y_units_ = copy.deepcopy(y_units)
|
1527 |
+
|
1528 |
# Re-perform data validation and feature name updating
|
1529 |
X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
|
1530 |
# Update feature names with selected variable names
|
|
|
1549 |
else:
|
1550 |
X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
|
1551 |
|
1552 |
+
return X, y, variable_names, X_units, y_units
|
1553 |
|
1554 |
def _run(self, X, y, mutated_params, weights, seed):
|
1555 |
"""
|
|
|
1785 |
and self.pretty_feature_names_in_ is not None
|
1786 |
else self.feature_names_in_.tolist()
|
1787 |
),
|
1788 |
+
X_units=self.X_units_,
|
1789 |
+
y_units=self.y_units_,
|
1790 |
options=options,
|
1791 |
numprocs=cprocs,
|
1792 |
parallelism=parallelism,
|
|
|
1812 |
Xresampled=None,
|
1813 |
weights=None,
|
1814 |
variable_names=None,
|
1815 |
+
X_units=None,
|
1816 |
+
y_units=None,
|
1817 |
):
|
1818 |
"""
|
1819 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
|
|
1841 |
instead of `variable_names`. Cannot contain spaces or special
|
1842 |
characters. Avoid variable names which are also
|
1843 |
function names in `sympy`, such as "N".
|
1844 |
+
X_units : list[str]
|
1845 |
+
A list of units for each variable in `X`. Each unit should be
|
1846 |
+
a string representing a Julia expression. See DynamicQuantities.jl
|
1847 |
+
https://symbolicml.org/DynamicQuantities.jl/dev/units/ for more
|
1848 |
+
information.
|
1849 |
+
y_units : str | list[str]
|
1850 |
+
Similar to `X_units`, but as a unit for the target variable, `y`.
|
1851 |
+
If `y` is a matrix, a list of units should be passed. If `X_units`
|
1852 |
+
is given but `y_units` is not, then `y_units` will be arbitrary.
|
1853 |
|
1854 |
Returns
|
1855 |
-------
|
|
|
1871 |
self.nout_ = 1
|
1872 |
self.selection_mask_ = None
|
1873 |
self.raw_julia_state_ = None
|
1874 |
+
self.X_units_ = None
|
1875 |
+
self.y_units_ = None
|
1876 |
|
1877 |
random_state = check_random_state(self.random_state) # For np random
|
1878 |
seed = random_state.get_state()[1][0] # For julia random
|
|
|
1881 |
|
1882 |
mutated_params = self._validate_and_set_init_params()
|
1883 |
|
1884 |
+
(
|
1885 |
+
X,
|
1886 |
+
y,
|
1887 |
+
Xresampled,
|
1888 |
+
weights,
|
1889 |
+
variable_names,
|
1890 |
+
X_units,
|
1891 |
+
y_units,
|
1892 |
+
) = self._validate_and_set_fit_params(
|
1893 |
+
X, y, Xresampled, weights, variable_names, X_units, y_units
|
1894 |
)
|
1895 |
|
1896 |
if X.shape[0] > 10000 and not self.batching:
|
|
|
1905 |
)
|
1906 |
|
1907 |
# Pre transformations (feature selection and denoising)
|
1908 |
+
X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
|
1909 |
+
X, y, Xresampled, variable_names, X_units, y_units, random_state
|
1910 |
)
|
1911 |
|
1912 |
# Warn about large feature counts (still warn if feature count is large
|
|
|
1935 |
variable_names,
|
1936 |
weights,
|
1937 |
y,
|
1938 |
+
X_units,
|
1939 |
+
y_units,
|
1940 |
)
|
1941 |
|
1942 |
# Initially, just save model parameters, so that
|