Spaces:

MilesCranmer
/

PySR

Sleeping

App Files Files Community

MilesCranmer commited on Jul 22, 2023

Commit

42005bd

•

1 Parent(s): 13d5805

Enable dimensional constraints

Browse files

Files changed (1) hide show

pysr/sr.py +85 -8

pysr/sr.py CHANGED Viewed

@@ -167,6 +167,8 @@ def _check_assertions(
     variable_names,
     weights,
     y,
 ):
     # Check for potential errors before they happen
     assert len(X.shape) == 2
@@ -190,6 +192,24 @@ def _check_assertions(
                     "Only alphanumeric characters, numbers, "
                     "and underscores are allowed."
                 )
 def best(*args, **kwargs):  # pragma: no cover
@@ -635,6 +655,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         has feature names that are all strings.
     pretty_feature_names_in_ : ndarray of shape (`n_features_in_`,)
         Pretty names of features, used only during printing.
     nout_ : int
         Number of output dimensions.
     selection_mask_ : list[int] of length `select_k_features`
@@ -1324,7 +1348,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return packed_modified_params
-    def _validate_and_set_fit_params(self, X, y, Xresampled, weights, variable_names):
         """
         Validate the parameters passed to the :term`fit` method.
@@ -1346,6 +1372,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             for that particular element of y.
         variable_names : list[str] of length n_features
             Names of each variable in the training dataset, `X`.
         Returns
         -------
@@ -1357,6 +1387,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Validated resampled training data used for denoising.
         variable_names_validated : list[str] of length n_features
             Validated list of variable names for each feature in `X`.
         """
         if isinstance(X, pd.DataFrame):
@@ -1415,10 +1449,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         else:
             raise NotImplementedError("y shape not supported!")
-        return X, y, Xresampled, weights, variable_names
     def _pre_transform_training_data(
-        self, X, y, Xresampled, variable_names, random_state
     ):
         """
         Transform the training data before fitting the symbolic regressor.
@@ -1438,6 +1475,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         variable_names : list[str]
             Names of each variable in the training dataset, `X`.
             Of length `n_features`.
         random_state : int | np.RandomState
             Pass an int for reproducible results across multiple function calls.
             See :term:`Glossary <random_state>`. Default is `None`.
@@ -1459,6 +1500,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         variable_names_transformed : list[str] of length n_features
             Names of each variable in the transformed dataset,
             `X_transformed`.
         """
         # Feature selection transformation
         if self.select_k_features:
@@ -1473,6 +1518,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             # Reduce variable_names to selection
             variable_names = [variable_names[i] for i in self.selection_mask_]
             # Re-perform data validation and feature name updating
             X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
             # Update feature names with selected variable names
@@ -1497,7 +1549,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             else:
                 X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
-        return X, y, variable_names
     def _run(self, X, y, mutated_params, weights, seed):
         """
@@ -1733,6 +1785,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 and self.pretty_feature_names_in_ is not None
                 else self.feature_names_in_.tolist()
             ),
             options=options,
             numprocs=cprocs,
             parallelism=parallelism,
@@ -1758,6 +1812,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Xresampled=None,
         weights=None,
         variable_names=None,
     ):
         """
         Search for equations to fit the dataset and store them in `self.equations_`.
@@ -1785,6 +1841,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             instead of `variable_names`. Cannot contain spaces or special
             characters. Avoid variable names which are also
             function names in `sympy`, such as "N".
         Returns
         -------
@@ -1806,6 +1871,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             self.nout_ = 1
             self.selection_mask_ = None
             self.raw_julia_state_ = None
         random_state = check_random_state(self.random_state)  # For np random
         seed = random_state.get_state()[1][0]  # For julia random
@@ -1814,8 +1881,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         mutated_params = self._validate_and_set_init_params()
-        X, y, Xresampled, weights, variable_names = self._validate_and_set_fit_params(
-            X, y, Xresampled, weights, variable_names
         )
         if X.shape[0] > 10000 and not self.batching:
@@ -1830,8 +1905,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             )
         # Pre transformations (feature selection and denoising)
-        X, y, variable_names = self._pre_transform_training_data(
-            X, y, Xresampled, variable_names, random_state
         )
         # Warn about large feature counts (still warn if feature count is large
@@ -1860,6 +1935,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             variable_names,
             weights,
             y,
         )
         # Initially, just save model parameters, so that

     variable_names,
     weights,
     y,
+    X_units,
+    y_units,
 ):
     # Check for potential errors before they happen
     assert len(X.shape) == 2
                     "Only alphanumeric characters, numbers, "
                     "and underscores are allowed."
                 )
+    if X_units is not None and len(X_units) != X.shape[1]:
+        raise ValueError(
+            "The number of units in `X_units` must equal the number of features in `X`."
+        )
+    if y_units is not None:
+        good_y_units = False
+        if isinstance(y_units, list):
+            if len(y.shape) == 1:
+                good_y_units = len(y_units) == 1
+            else:
+                good_y_units = len(y_units) == y.shape[1]
+        else:
+            good_y_units = len(y.shape) == 1 or y.shape[1] == 1
+        if not good_y_units:
+            raise ValueError(
+                "The number of units in `y_units` must equal the number of output features in `y`."
+            )
 def best(*args, **kwargs):  # pragma: no cover
         has feature names that are all strings.
     pretty_feature_names_in_ : ndarray of shape (`n_features_in_`,)
         Pretty names of features, used only during printing.
+    X_units_ : list[str] of length n_features
+        Units of each variable in the training dataset, `X`.
+    y_units_ : str | list[str] of length n_out
+        Units of each variable in the training dataset, `y`.
     nout_ : int
         Number of output dimensions.
     selection_mask_ : list[int] of length `select_k_features`
         return packed_modified_params
+    def _validate_and_set_fit_params(
+        self, X, y, Xresampled, weights, variable_names, X_units, y_units
+    ):
         """
         Validate the parameters passed to the :term`fit` method.
             for that particular element of y.
         variable_names : list[str] of length n_features
             Names of each variable in the training dataset, `X`.
+        X_units : list[str] of length n_features
+            Units of each variable in the training dataset, `X`.
+        y_units : str | list[str] of length n_out
+            Units of each variable in the training dataset, `y`.
         Returns
         -------
             Validated resampled training data used for denoising.
         variable_names_validated : list[str] of length n_features
             Validated list of variable names for each feature in `X`.
+        X_units : list[str] of length n_features
+            Validated units for `X`.
+        y_units : str | list[str] of length n_out
+            Validated units for `y`.
         """
         if isinstance(X, pd.DataFrame):
         else:
             raise NotImplementedError("y shape not supported!")
+        self.X_units_ = copy.deepcopy(X_units)
+        self.y_units_ = copy.deepcopy(y_units)
+        return X, y, Xresampled, weights, variable_names, X_units, y_units
     def _pre_transform_training_data(
+        self, X, y, Xresampled, variable_names, X_units, y_units, random_state
     ):
         """
         Transform the training data before fitting the symbolic regressor.
         variable_names : list[str]
             Names of each variable in the training dataset, `X`.
             Of length `n_features`.
+        X_units : list[str]
+            Units of each variable in the training dataset, `X`.
+        y_units : str | list[str]
+            Units of each variable in the training dataset, `y`.
         random_state : int | np.RandomState
             Pass an int for reproducible results across multiple function calls.
             See :term:`Glossary <random_state>`. Default is `None`.
         variable_names_transformed : list[str] of length n_features
             Names of each variable in the transformed dataset,
             `X_transformed`.
+        X_units_transformed : list[str] of length n_features
+            Units of each variable in the transformed dataset.
+        y_units_transformed : str | list[str] of length n_out
+            Units of each variable in the transformed dataset.
         """
         # Feature selection transformation
         if self.select_k_features:
             # Reduce variable_names to selection
             variable_names = [variable_names[i] for i in self.selection_mask_]
+            if X_units is not None:
+                X_units = [X_units[i] for i in self.selection_mask_]
+                self.X_units_ = copy.deepcopy(X_units)
+            if y_units is not None:
+                y_units = [y_units[i] for i in self.selection_mask_]
+                self.y_units_ = copy.deepcopy(y_units)
             # Re-perform data validation and feature name updating
             X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
             # Update feature names with selected variable names
             else:
                 X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
+        return X, y, variable_names, X_units, y_units
     def _run(self, X, y, mutated_params, weights, seed):
         """
                 and self.pretty_feature_names_in_ is not None
                 else self.feature_names_in_.tolist()
             ),
+            X_units=self.X_units_,
+            y_units=self.y_units_,
             options=options,
             numprocs=cprocs,
             parallelism=parallelism,
         Xresampled=None,
         weights=None,
         variable_names=None,
+        X_units=None,
+        y_units=None,
     ):
         """
         Search for equations to fit the dataset and store them in `self.equations_`.
             instead of `variable_names`. Cannot contain spaces or special
             characters. Avoid variable names which are also
             function names in `sympy`, such as "N".
+        X_units : list[str]
+            A list of units for each variable in `X`. Each unit should be
+            a string representing a Julia expression. See DynamicQuantities.jl
+            https://symbolicml.org/DynamicQuantities.jl/dev/units/ for more
+            information.
+        y_units : str | list[str]
+            Similar to `X_units`, but as a unit for the target variable, `y`.
+            If `y` is a matrix, a list of units should be passed. If `X_units`
+            is given but `y_units` is not, then `y_units` will be arbitrary.
         Returns
         -------
             self.nout_ = 1
             self.selection_mask_ = None
             self.raw_julia_state_ = None
+            self.X_units_ = None
+            self.y_units_ = None
         random_state = check_random_state(self.random_state)  # For np random
         seed = random_state.get_state()[1][0]  # For julia random
         mutated_params = self._validate_and_set_init_params()
+        (
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            X_units,
+            y_units,
+        ) = self._validate_and_set_fit_params(
+            X, y, Xresampled, weights, variable_names, X_units, y_units
         )
         if X.shape[0] > 10000 and not self.batching:
             )
         # Pre transformations (feature selection and denoising)
+        X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
+            X, y, Xresampled, variable_names, X_units, y_units, random_state
         )
         # Warn about large feature counts (still warn if feature count is large
             variable_names,
             weights,
             y,
+            X_units,
+            y_units,
         )
         # Initially, just save model parameters, so that