MilesCranmer committed on
Commit
42005bd
1 Parent(s): 13d5805

Enable dimensional constraints

Browse files
Files changed (1) hide show
  1. pysr/sr.py +85 -8
pysr/sr.py CHANGED
@@ -167,6 +167,8 @@ def _check_assertions(
167
  variable_names,
168
  weights,
169
  y,
 
 
170
  ):
171
  # Check for potential errors before they happen
172
  assert len(X.shape) == 2
@@ -190,6 +192,24 @@ def _check_assertions(
190
  "Only alphanumeric characters, numbers, "
191
  "and underscores are allowed."
192
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
 
195
  def best(*args, **kwargs): # pragma: no cover
@@ -635,6 +655,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
635
  has feature names that are all strings.
636
  pretty_feature_names_in_ : ndarray of shape (`n_features_in_`,)
637
  Pretty names of features, used only during printing.
 
 
 
 
638
  nout_ : int
639
  Number of output dimensions.
640
  selection_mask_ : list[int] of length `select_k_features`
@@ -1324,7 +1348,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1324
 
1325
  return packed_modified_params
1326
 
1327
- def _validate_and_set_fit_params(self, X, y, Xresampled, weights, variable_names):
 
 
1328
  """
1329
  Validate the parameters passed to the :term:`fit` method.
1330
 
@@ -1346,6 +1372,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1346
  for that particular element of y.
1347
  variable_names : list[str] of length n_features
1348
  Names of each variable in the training dataset, `X`.
 
 
 
 
1349
 
1350
  Returns
1351
  -------
@@ -1357,6 +1387,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1357
  Validated resampled training data used for denoising.
1358
  variable_names_validated : list[str] of length n_features
1359
  Validated list of variable names for each feature in `X`.
 
 
 
 
1360
 
1361
  """
1362
  if isinstance(X, pd.DataFrame):
@@ -1415,10 +1449,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1415
  else:
1416
  raise NotImplementedError("y shape not supported!")
1417
 
1418
- return X, y, Xresampled, weights, variable_names
 
 
 
1419
 
1420
  def _pre_transform_training_data(
1421
- self, X, y, Xresampled, variable_names, random_state
1422
  ):
1423
  """
1424
  Transform the training data before fitting the symbolic regressor.
@@ -1438,6 +1475,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1438
  variable_names : list[str]
1439
  Names of each variable in the training dataset, `X`.
1440
  Of length `n_features`.
 
 
 
 
1441
  random_state : int | np.RandomState
1442
  Pass an int for reproducible results across multiple function calls.
1443
  See :term:`Glossary <random_state>`. Default is `None`.
@@ -1459,6 +1500,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1459
  variable_names_transformed : list[str] of length n_features
1460
  Names of each variable in the transformed dataset,
1461
  `X_transformed`.
 
 
 
 
1462
  """
1463
  # Feature selection transformation
1464
  if self.select_k_features:
@@ -1473,6 +1518,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1473
  # Reduce variable_names to selection
1474
  variable_names = [variable_names[i] for i in self.selection_mask_]
1475
 
 
 
 
 
 
 
 
1476
  # Re-perform data validation and feature name updating
1477
  X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
1478
  # Update feature names with selected variable names
@@ -1497,7 +1549,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1497
  else:
1498
  X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
1499
 
1500
- return X, y, variable_names
1501
 
1502
  def _run(self, X, y, mutated_params, weights, seed):
1503
  """
@@ -1733,6 +1785,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1733
  and self.pretty_feature_names_in_ is not None
1734
  else self.feature_names_in_.tolist()
1735
  ),
 
 
1736
  options=options,
1737
  numprocs=cprocs,
1738
  parallelism=parallelism,
@@ -1758,6 +1812,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1758
  Xresampled=None,
1759
  weights=None,
1760
  variable_names=None,
 
 
1761
  ):
1762
  """
1763
  Search for equations to fit the dataset and store them in `self.equations_`.
@@ -1785,6 +1841,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1785
  instead of `variable_names`. Cannot contain spaces or special
1786
  characters. Avoid variable names which are also
1787
  function names in `sympy`, such as "N".
 
 
 
 
 
 
 
 
 
1788
 
1789
  Returns
1790
  -------
@@ -1806,6 +1871,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1806
  self.nout_ = 1
1807
  self.selection_mask_ = None
1808
  self.raw_julia_state_ = None
 
 
1809
 
1810
  random_state = check_random_state(self.random_state) # For np random
1811
  seed = random_state.get_state()[1][0] # For julia random
@@ -1814,8 +1881,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1814
 
1815
  mutated_params = self._validate_and_set_init_params()
1816
 
1817
- X, y, Xresampled, weights, variable_names = self._validate_and_set_fit_params(
1818
- X, y, Xresampled, weights, variable_names
 
 
 
 
 
 
 
 
1819
  )
1820
 
1821
  if X.shape[0] > 10000 and not self.batching:
@@ -1830,8 +1905,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1830
  )
1831
 
1832
  # Pre transformations (feature selection and denoising)
1833
- X, y, variable_names = self._pre_transform_training_data(
1834
- X, y, Xresampled, variable_names, random_state
1835
  )
1836
 
1837
  # Warn about large feature counts (still warn if feature count is large
@@ -1860,6 +1935,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1860
  variable_names,
1861
  weights,
1862
  y,
 
 
1863
  )
1864
 
1865
  # Initially, just save model parameters, so that
 
167
  variable_names,
168
  weights,
169
  y,
170
+ X_units,
171
+ y_units,
172
  ):
173
  # Check for potential errors before they happen
174
  assert len(X.shape) == 2
 
192
  "Only alphanumeric characters, numbers, "
193
  "and underscores are allowed."
194
  )
195
+ if X_units is not None and len(X_units) != X.shape[1]:
196
+ raise ValueError(
197
+ "The number of units in `X_units` must equal the number of features in `X`."
198
+ )
199
+ if y_units is not None:
200
+ good_y_units = False
201
+ if isinstance(y_units, list):
202
+ if len(y.shape) == 1:
203
+ good_y_units = len(y_units) == 1
204
+ else:
205
+ good_y_units = len(y_units) == y.shape[1]
206
+ else:
207
+ good_y_units = len(y.shape) == 1 or y.shape[1] == 1
208
+
209
+ if not good_y_units:
210
+ raise ValueError(
211
+ "The number of units in `y_units` must equal the number of output features in `y`."
212
+ )
213
 
214
 
215
  def best(*args, **kwargs): # pragma: no cover
 
655
  has feature names that are all strings.
656
  pretty_feature_names_in_ : ndarray of shape (`n_features_in_`,)
657
  Pretty names of features, used only during printing.
658
+ X_units_ : list[str] of length n_features
659
+ Units of each variable in the training dataset, `X`.
660
+ y_units_ : str | list[str] of length n_out
661
+ Units of each variable in the training dataset, `y`.
662
  nout_ : int
663
  Number of output dimensions.
664
  selection_mask_ : list[int] of length `select_k_features`
 
1348
 
1349
  return packed_modified_params
1350
 
1351
+ def _validate_and_set_fit_params(
1352
+ self, X, y, Xresampled, weights, variable_names, X_units, y_units
1353
+ ):
1354
  """
1355
  Validate the parameters passed to the :term:`fit` method.
1356
 
 
1372
  for that particular element of y.
1373
  variable_names : list[str] of length n_features
1374
  Names of each variable in the training dataset, `X`.
1375
+ X_units : list[str] of length n_features
1376
+ Units of each variable in the training dataset, `X`.
1377
+ y_units : str | list[str] of length n_out
1378
+ Units of each variable in the training dataset, `y`.
1379
 
1380
  Returns
1381
  -------
 
1387
  Validated resampled training data used for denoising.
1388
  variable_names_validated : list[str] of length n_features
1389
  Validated list of variable names for each feature in `X`.
1390
+ X_units : list[str] of length n_features
1391
+ Validated units for `X`.
1392
+ y_units : str | list[str] of length n_out
1393
+ Validated units for `y`.
1394
 
1395
  """
1396
  if isinstance(X, pd.DataFrame):
 
1449
  else:
1450
  raise NotImplementedError("y shape not supported!")
1451
 
1452
+ self.X_units_ = copy.deepcopy(X_units)
1453
+ self.y_units_ = copy.deepcopy(y_units)
1454
+
1455
+ return X, y, Xresampled, weights, variable_names, X_units, y_units
1456
 
1457
  def _pre_transform_training_data(
1458
+ self, X, y, Xresampled, variable_names, X_units, y_units, random_state
1459
  ):
1460
  """
1461
  Transform the training data before fitting the symbolic regressor.
 
1475
  variable_names : list[str]
1476
  Names of each variable in the training dataset, `X`.
1477
  Of length `n_features`.
1478
+ X_units : list[str]
1479
+ Units of each variable in the training dataset, `X`.
1480
+ y_units : str | list[str]
1481
+ Units of each variable in the training dataset, `y`.
1482
  random_state : int | np.RandomState
1483
  Pass an int for reproducible results across multiple function calls.
1484
  See :term:`Glossary <random_state>`. Default is `None`.
 
1500
  variable_names_transformed : list[str] of length n_features
1501
  Names of each variable in the transformed dataset,
1502
  `X_transformed`.
1503
+ X_units_transformed : list[str] of length n_features
1504
+ Units of each variable in the transformed dataset.
1505
+ y_units_transformed : str | list[str] of length n_out
1506
+ Units of each variable in the transformed dataset.
1507
  """
1508
  # Feature selection transformation
1509
  if self.select_k_features:
 
1518
  # Reduce variable_names to selection
1519
  variable_names = [variable_names[i] for i in self.selection_mask_]
1520
 
1521
+ if X_units is not None:
1522
+ X_units = [X_units[i] for i in self.selection_mask_]
1523
+ self.X_units_ = copy.deepcopy(X_units)
1524
+ if y_units is not None:
1525
+ y_units = [y_units[i] for i in self.selection_mask_]
1526
+ self.y_units_ = copy.deepcopy(y_units)
1527
+
1528
  # Re-perform data validation and feature name updating
1529
  X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
1530
  # Update feature names with selected variable names
 
1549
  else:
1550
  X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
1551
 
1552
+ return X, y, variable_names, X_units, y_units
1553
 
1554
  def _run(self, X, y, mutated_params, weights, seed):
1555
  """
 
1785
  and self.pretty_feature_names_in_ is not None
1786
  else self.feature_names_in_.tolist()
1787
  ),
1788
+ X_units=self.X_units_,
1789
+ y_units=self.y_units_,
1790
  options=options,
1791
  numprocs=cprocs,
1792
  parallelism=parallelism,
 
1812
  Xresampled=None,
1813
  weights=None,
1814
  variable_names=None,
1815
+ X_units=None,
1816
+ y_units=None,
1817
  ):
1818
  """
1819
  Search for equations to fit the dataset and store them in `self.equations_`.
 
1841
  instead of `variable_names`. Cannot contain spaces or special
1842
  characters. Avoid variable names which are also
1843
  function names in `sympy`, such as "N".
1844
+ X_units : list[str]
1845
+ A list of units for each variable in `X`. Each unit should be
1846
+ a string representing a Julia expression. See DynamicQuantities.jl
1847
+ https://symbolicml.org/DynamicQuantities.jl/dev/units/ for more
1848
+ information.
1849
+ y_units : str | list[str]
1850
+ Similar to `X_units`, but as a unit for the target variable, `y`.
1851
+ If `y` is a matrix, a list of units should be passed. If `X_units`
1852
+ is given but `y_units` is not, then `y_units` will be arbitrary.
1853
 
1854
  Returns
1855
  -------
 
1871
  self.nout_ = 1
1872
  self.selection_mask_ = None
1873
  self.raw_julia_state_ = None
1874
+ self.X_units_ = None
1875
+ self.y_units_ = None
1876
 
1877
  random_state = check_random_state(self.random_state) # For np random
1878
  seed = random_state.get_state()[1][0] # For julia random
 
1881
 
1882
  mutated_params = self._validate_and_set_init_params()
1883
 
1884
+ (
1885
+ X,
1886
+ y,
1887
+ Xresampled,
1888
+ weights,
1889
+ variable_names,
1890
+ X_units,
1891
+ y_units,
1892
+ ) = self._validate_and_set_fit_params(
1893
+ X, y, Xresampled, weights, variable_names, X_units, y_units
1894
  )
1895
 
1896
  if X.shape[0] > 10000 and not self.batching:
 
1905
  )
1906
 
1907
  # Pre transformations (feature selection and denoising)
1908
+ X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
1909
+ X, y, Xresampled, variable_names, X_units, y_units, random_state
1910
  )
1911
 
1912
  # Warn about large feature counts (still warn if feature count is large
 
1935
  variable_names,
1936
  weights,
1937
  y,
1938
+ X_units,
1939
+ y_units,
1940
  )
1941
 
1942
  # Initially, just save model parameters, so that