MilesCranmer committed on
Commit
530ae99
·
unverified ·
1 Parent(s): 505af8d

refactor: runtime parameters into dataclass

Browse files
Files changed (2) hide show
  1. pysr/sr.py +110 -82
  2. pysr/utils.py +3 -1
pysr/sr.py CHANGED
@@ -8,6 +8,7 @@ import shutil
8
  import sys
9
  import tempfile
10
  import warnings
 
11
  from datetime import datetime
12
  from io import StringIO
13
  from multiprocessing import cpu_count
@@ -48,6 +49,7 @@ from .julia_helpers import (
48
  from .julia_import import SymbolicRegression, jl
49
  from .utils import (
50
  ArrayLike,
 
51
  _csv_filename_to_pkl_filename,
52
  _preprocess_julia_floats,
53
  _safe_check_feature_names_in,
@@ -182,6 +184,21 @@ def _check_assertions(
182
  VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
183
 
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
186
  """
187
  High-performance symbolic regression algorithm.
@@ -676,7 +693,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
676
  nout_: int
677
  selection_mask_: Union[NDArray[np.bool_], None]
678
  tempdir_: Path
679
- equation_file_: Union[str, Path]
680
  julia_state_stream_: Union[NDArray[np.uint8], None]
681
  julia_options_stream_: Union[NDArray[np.uint8], None]
682
  equation_file_contents_: Union[List[pd.DataFrame], None]
@@ -914,7 +931,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
914
  @classmethod
915
  def from_file(
916
  cls,
917
- equation_file,
918
  *,
919
  binary_operators: Optional[List[str]] = None,
920
  unary_operators: Optional[List[str]] = None,
@@ -929,7 +946,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
929
 
930
  Parameters
931
  ----------
932
- equation_file : str
933
  Path to a pickle file containing a saved model, or a csv file
934
  containing equations.
935
  binary_operators : list[str]
@@ -996,7 +1013,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
996
 
997
  # TODO: copy .bkup file if exists.
998
  model = cls(
999
- equation_file=equation_file,
1000
  binary_operators=binary_operators,
1001
  unary_operators=unary_operators,
1002
  **pysr_kwargs,
@@ -1191,25 +1208,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1191
  index, list
1192
  ), "With multiple output features, index must be a list."
1193
  return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
1194
- elif isinstance(self.equations_, pd.DataFrame):
1195
- return cast(pd.Series, self.equations_.iloc[index])
1196
  else:
1197
- raise ValueError("No equations have been generated yet.")
 
1198
 
1199
  if isinstance(self.equations_, list):
1200
  return [
1201
  cast(pd.Series, eq.loc[idx_model_selection(eq, self.model_selection)])
1202
  for eq in self.equations_
1203
  ]
1204
- elif isinstance(self.equations_, pd.DataFrame):
 
1205
  return cast(
1206
  pd.Series,
1207
- self.equations_.loc[
1208
- idx_model_selection(self.equations_, self.model_selection)
1209
- ],
1210
  )
1211
- else:
1212
- raise ValueError("No equations have been generated yet.")
1213
 
1214
  def _setup_equation_file(self):
1215
  """
@@ -1234,7 +1247,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1234
  self.equation_file_ = self.equation_file
1235
  self.equation_file_contents_ = None
1236
 
1237
- def _validate_and_set_init_params(self):
1238
  """
1239
  Ensure parameters passed at initialization are valid.
1240
 
@@ -1292,55 +1305,36 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1292
  f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
1293
  )
1294
 
1295
- progress = self.progress
1296
- # 'Mutable' parameter validation
1297
- # (Params and their default values, if None is given:)
1298
- default_param_mapping = {
1299
- "binary_operators": "+ * - /".split(" "),
1300
- "unary_operators": [],
1301
- "maxdepth": self.maxsize,
1302
- "constraints": {},
1303
- "multithreading": self.procs != 0 and self.cluster_manager is None,
1304
- "batch_size": 1,
1305
- "update_verbosity": int(self.verbosity),
1306
- "progress": progress,
1307
- }
1308
- packed_modified_params = {}
1309
- for parameter, default_value in default_param_mapping.items():
1310
- parameter_value = getattr(self, parameter)
1311
- if parameter_value is None:
1312
- parameter_value = default_value
1313
  else:
1314
- # Special cases such as when binary_operators is a string
1315
- if parameter in ["binary_operators", "unary_operators"] and isinstance(
1316
- parameter_value, str
1317
- ):
1318
- parameter_value = [parameter_value]
1319
- elif parameter == "batch_size" and parameter_value < 1:
1320
- warnings.warn(
1321
- "Given `batch_size` must be greater than or equal to one. "
1322
- "`batch_size` has been increased to equal one."
1323
- )
1324
- parameter_value = 1
1325
- elif (
1326
- parameter == "progress"
1327
- and parameter_value
1328
- and "buffer" not in sys.stdout.__dir__()
1329
- ):
1330
- warnings.warn(
1331
- "Note: it looks like you are running in Jupyter. "
1332
- "The progress bar will be turned off."
1333
- )
1334
- parameter_value = False
1335
- packed_modified_params[parameter] = parameter_value
1336
 
1337
  assert (
1338
- len(packed_modified_params["binary_operators"])
1339
- + len(packed_modified_params["unary_operators"])
1340
- > 0
1341
- )
1342
 
1343
- return packed_modified_params
1344
 
1345
  def _validate_and_set_fit_params(
1346
  self, X, y, Xresampled, weights, variable_names, X_units, y_units
@@ -1568,20 +1562,27 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1568
 
1569
  return X, y, variable_names, X_units, y_units
1570
 
1571
- def _run(self, X, y, mutated_params, weights, seed: int):
 
 
 
 
 
 
 
1572
  """
1573
  Run the symbolic regression fitting process on the julia backend.
1574
 
1575
  Parameters
1576
  ----------
1577
- X : ndarray | pandas.DataFrame
1578
  Training data of shape `(n_samples, n_features)`.
1579
- y : ndarray | pandas.DataFrame
1580
  Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
1581
  Will be cast to `X`'s dtype if necessary.
1582
- mutated_params : dict[str, Any]
1583
- Dictionary of mutated versions of some parameters passed in __init__.
1584
- weights : ndarray | pandas.DataFrame
1585
  Weight array of the same shape as `y`.
1586
  Each element is how to weight the mean-square-error loss
1587
  for that particular element of y.
@@ -1604,17 +1605,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1604
 
1605
  # These are the parameters which may be modified from the ones
1606
  # specified in init, so we define them here locally:
1607
- binary_operators = mutated_params["binary_operators"]
1608
- unary_operators = mutated_params["unary_operators"]
1609
- maxdepth = mutated_params["maxdepth"]
1610
- constraints = mutated_params["constraints"]
1611
  nested_constraints = self.nested_constraints
1612
  complexity_of_operators = self.complexity_of_operators
1613
- multithreading = mutated_params["multithreading"]
1614
  cluster_manager = self.cluster_manager
1615
- batch_size = mutated_params["batch_size"]
1616
- update_verbosity = mutated_params["update_verbosity"]
1617
- progress = mutated_params["progress"]
 
1618
 
1619
  # Start julia backend processes
1620
  if not already_ran and update_verbosity != 0:
@@ -1656,6 +1658,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1656
  complexity_of_operators_str += f"({k}) => {v}, "
1657
  complexity_of_operators_str += ")"
1658
  complexity_of_operators = jl.seval(complexity_of_operators_str)
 
1659
 
1660
  custom_loss = jl.seval(
1661
  str(self.elementwise_loss)
@@ -1728,9 +1731,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1728
  fraction_replaced_hof=self.fraction_replaced_hof,
1729
  should_simplify=self.should_simplify,
1730
  should_optimize_constants=self.should_optimize_constants,
1731
- warmup_maxsize_by=(
1732
- 0.0 if self.warmup_maxsize_by is None else self.warmup_maxsize_by
1733
- ),
1734
  use_frequency=self.use_frequency,
1735
  use_frequency_in_tournament=self.use_frequency_in_tournament,
1736
  adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
@@ -1913,7 +1914,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1913
 
1914
  self._setup_equation_file()
1915
 
1916
- mutated_params = self._validate_and_set_init_params()
1917
 
1918
  (
1919
  X,
@@ -1939,7 +1940,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1939
  )
1940
 
1941
  random_state = check_random_state(self.random_state) # For np random
1942
- seed = random_state.randint(0, 2**31 - 1) # For julia random
1943
 
1944
  # Pre transformations (feature selection and denoising)
1945
  X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
@@ -1982,7 +1983,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1982
  self._checkpoint()
1983
 
1984
  # Perform the search:
1985
- self._run(X, y, mutated_params, weights=weights, seed=seed)
1986
 
1987
  # Then, after fit, we save again, so the pickle file contains
1988
  # the equations:
@@ -1991,7 +1992,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1991
 
1992
  return self
1993
 
1994
- def refresh(self, checkpoint_file=None) -> None:
1995
  """
1996
  Update self.equations_ with any new options passed.
1997
 
@@ -2000,11 +2001,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2000
 
2001
  Parameters
2002
  ----------
2003
- checkpoint_file : str
2004
  Path to checkpoint hall of fame file to be loaded.
2005
  The default will use the set `equation_file_`.
2006
  """
2007
- if checkpoint_file:
2008
  self.equation_file_ = checkpoint_file
2009
  self.equation_file_contents_ = None
2010
  check_is_fitted(self, attributes=["equation_file_"])
@@ -2457,3 +2458,30 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str):
2457
  f"{model_selection} is not a valid model selection strategy."
2458
  )
2459
  return chosen_idx
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import sys
9
  import tempfile
10
  import warnings
11
+ from dataclasses import dataclass, fields
12
  from datetime import datetime
13
  from io import StringIO
14
  from multiprocessing import cpu_count
 
49
  from .julia_import import SymbolicRegression, jl
50
  from .utils import (
51
  ArrayLike,
52
+ PathLike,
53
  _csv_filename_to_pkl_filename,
54
  _preprocess_julia_floats,
55
  _safe_check_feature_names_in,
 
184
  VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
185
 
186
 
187
+ @dataclass
188
+ class _DynamicallySetParams:
189
+ """Defines some parameters that are set at runtime."""
190
+
191
+ binary_operators: List[str]  # ["+", "*", "-", "/"] when the user passes None; a bare str is wrapped in a list
192
+ unary_operators: List[str]  # [] when the user passes None; a bare str is wrapped in a list
193
+ maxdepth: int  # falls back to the estimator's `maxsize` when the user passes None
194
+ constraints: Dict[str, str]  # {} when the user passes None
195
+ multithreading: bool  # falls back to `procs != 0 and cluster_manager is None`
196
+ batch_size: int  # 1 when the user passes None; values below 1 are raised to 1 with a warning
197
+ update_verbosity: int  # falls back to `int(verbosity)`
198
+ progress: bool  # seeded from the estimator's `progress`; forced to False when sys.stdout exposes no "buffer"
199
+ warmup_maxsize_by: float  # 0.0 when the user passes None
200
+
201
+
202
  class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
203
  """
204
  High-performance symbolic regression algorithm.
 
693
  nout_: int
694
  selection_mask_: Union[NDArray[np.bool_], None]
695
  tempdir_: Path
696
+ equation_file_: PathLike
697
  julia_state_stream_: Union[NDArray[np.uint8], None]
698
  julia_options_stream_: Union[NDArray[np.uint8], None]
699
  equation_file_contents_: Union[List[pd.DataFrame], None]
 
931
  @classmethod
932
  def from_file(
933
  cls,
934
+ equation_file: PathLike,
935
  *,
936
  binary_operators: Optional[List[str]] = None,
937
  unary_operators: Optional[List[str]] = None,
 
946
 
947
  Parameters
948
  ----------
949
+ equation_file : str or Path
950
  Path to a pickle file containing a saved model, or a csv file
951
  containing equations.
952
  binary_operators : list[str]
 
1013
 
1014
  # TODO: copy .bkup file if exists.
1015
  model = cls(
1016
+ equation_file=str(equation_file),
1017
  binary_operators=binary_operators,
1018
  unary_operators=unary_operators,
1019
  **pysr_kwargs,
 
1208
  index, list
1209
  ), "With multiple output features, index must be a list."
1210
  return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
 
 
1211
  else:
1212
+ equations_ = cast(pd.DataFrame, self.equations_)
1213
+ return cast(pd.Series, equations_.iloc[index])
1214
 
1215
  if isinstance(self.equations_, list):
1216
  return [
1217
  cast(pd.Series, eq.loc[idx_model_selection(eq, self.model_selection)])
1218
  for eq in self.equations_
1219
  ]
1220
+ else:
1221
+ equations_ = cast(pd.DataFrame, self.equations_)
1222
  return cast(
1223
  pd.Series,
1224
+ equations_.loc[idx_model_selection(equations_, self.model_selection)],
 
 
1225
  )
 
 
1226
 
1227
  def _setup_equation_file(self):
1228
  """
 
1247
  self.equation_file_ = self.equation_file
1248
  self.equation_file_contents_ = None
1249
 
1250
+ def _validate_and_modify_params(self) -> _DynamicallySetParams:
1251
  """
1252
  Ensure parameters passed at initialization are valid.
1253
 
 
1305
  f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
1306
  )
1307
 
1308
+ param_container = _DynamicallySetParams(
1309
+ binary_operators=["+", "*", "-", "/"],
1310
+ unary_operators=[],
1311
+ maxdepth=self.maxsize,
1312
+ constraints={},
1313
+ multithreading=self.procs != 0 and self.cluster_manager is None,
1314
+ batch_size=1,
1315
+ update_verbosity=int(self.verbosity),
1316
+ progress=self.progress,
1317
+ warmup_maxsize_by=0.0,
1318
+ )
1319
+
1320
+ for param_name in map(lambda x: x.name, fields(_DynamicallySetParams)):
1321
+ user_param_value = getattr(self, param_name)
1322
+ if user_param_value is None:
1323
+ # Leave as the default in _DynamicallySetParams
1324
+ ...
 
1325
  else:
1326
+ # If user has specified it, we will override the default.
1327
+ # However, there are some special cases to mutate it:
1328
+ new_param_value = _mutate_parameter(param_name, user_param_value)
1329
+ setattr(param_container, param_name, new_param_value)
1330
+ # TODO: This should just be part of the __init__ of _DynamicallySetParams
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1331
 
1332
  assert (
1333
+ len(param_container.binary_operators) > 0
1334
+ or len(param_container.unary_operators) > 0
1335
+ ), "At least one operator must be provided."
 
1336
 
1337
+ return param_container
1338
 
1339
  def _validate_and_set_fit_params(
1340
  self, X, y, Xresampled, weights, variable_names, X_units, y_units
 
1562
 
1563
  return X, y, variable_names, X_units, y_units
1564
 
1565
+ def _run(
1566
+ self,
1567
+ X: ndarray,
1568
+ y: ndarray,
1569
+ runtime_params: _DynamicallySetParams,
1570
+ weights: Optional[ndarray],
1571
+ seed: int,
1572
+ ):
1573
  """
1574
  Run the symbolic regression fitting process on the julia backend.
1575
 
1576
  Parameters
1577
  ----------
1578
+ X : ndarray
1579
  Training data of shape `(n_samples, n_features)`.
1580
+ y : ndarray
1581
  Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
1582
  Will be cast to `X`'s dtype if necessary.
1583
+ runtime_params : _DynamicallySetParams
1584
+ Dynamically set versions of some parameters passed in __init__.
1585
+ weights : ndarray | None
1586
  Weight array of the same shape as `y`.
1587
  Each element is how to weight the mean-square-error loss
1588
  for that particular element of y.
 
1605
 
1606
  # These are the parameters which may be modified from the ones
1607
  # specified in init, so we define them here locally:
1608
+ binary_operators = runtime_params.binary_operators
1609
+ unary_operators = runtime_params.unary_operators
1610
+ maxdepth = runtime_params.maxdepth
1611
+ constraints = runtime_params.constraints
1612
  nested_constraints = self.nested_constraints
1613
  complexity_of_operators = self.complexity_of_operators
1614
+ multithreading = runtime_params.multithreading
1615
  cluster_manager = self.cluster_manager
1616
+ batch_size = runtime_params.batch_size
1617
+ update_verbosity = runtime_params.update_verbosity
1618
+ progress = runtime_params.progress
1619
+ warmup_maxsize_by = runtime_params.warmup_maxsize_by
1620
 
1621
  # Start julia backend processes
1622
  if not already_ran and update_verbosity != 0:
 
1658
  complexity_of_operators_str += f"({k}) => {v}, "
1659
  complexity_of_operators_str += ")"
1660
  complexity_of_operators = jl.seval(complexity_of_operators_str)
1661
+ # TODO: Refactor this into helper function
1662
 
1663
  custom_loss = jl.seval(
1664
  str(self.elementwise_loss)
 
1731
  fraction_replaced_hof=self.fraction_replaced_hof,
1732
  should_simplify=self.should_simplify,
1733
  should_optimize_constants=self.should_optimize_constants,
1734
+ warmup_maxsize_by=warmup_maxsize_by,
 
 
1735
  use_frequency=self.use_frequency,
1736
  use_frequency_in_tournament=self.use_frequency_in_tournament,
1737
  adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
 
1914
 
1915
  self._setup_equation_file()
1916
 
1917
+ runtime_params = self._validate_and_modify_params()
1918
 
1919
  (
1920
  X,
 
1940
  )
1941
 
1942
  random_state = check_random_state(self.random_state) # For np random
1943
+ seed = cast(int, random_state.randint(0, 2**31 - 1)) # For julia random
1944
 
1945
  # Pre transformations (feature selection and denoising)
1946
  X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
 
1983
  self._checkpoint()
1984
 
1985
  # Perform the search:
1986
+ self._run(X, y, runtime_params, weights=weights, seed=seed)
1987
 
1988
  # Then, after fit, we save again, so the pickle file contains
1989
  # the equations:
 
1992
 
1993
  return self
1994
 
1995
+ def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None:
1996
  """
1997
  Update self.equations_ with any new options passed.
1998
 
 
2001
 
2002
  Parameters
2003
  ----------
2004
+ checkpoint_file : str or Path
2005
  Path to checkpoint hall of fame file to be loaded.
2006
  The default will use the set `equation_file_`.
2007
  """
2008
+ if checkpoint_file is not None:
2009
  self.equation_file_ = checkpoint_file
2010
  self.equation_file_contents_ = None
2011
  check_is_fitted(self, attributes=["equation_file_"])
 
2458
  f"{model_selection} is not a valid model selection strategy."
2459
  )
2460
  return chosen_idx
2461
+
2462
+
2463
def _mutate_parameter(param_name: str, param_value):
    """Normalize a user-supplied parameter before it is used at runtime.

    Most values are returned untouched; only three parameters have
    special handling.

    Parameters
    ----------
    param_name : str
        Name of the parameter being processed.
    param_value : Any
        Value supplied by the user. The visible caller,
        `_validate_and_modify_params`, only invokes this function for
        values that are not None.

    Returns
    -------
    Any
        - `binary_operators` / `unary_operators`: a single string is
          wrapped into a one-element list.
        - `batch_size`: values below 1 are replaced by 1 (with a warning).
        - `progress`: a truthy value is replaced by False (with a warning)
          when `sys.stdout` exposes no `buffer` attribute.
        - anything else: `param_value` unchanged.
    """
    if param_name in ("binary_operators", "unary_operators") and isinstance(
        param_value, str
    ):
        return [param_value]

    if param_name == "batch_size" and param_value < 1:
        warnings.warn(
            "Given `batch_size` must be greater than or equal to one. "
            "`batch_size` has been increased to equal one."
        )
        return 1

    # Truthiness (not `== True`) mirrors the pre-refactor check, so any
    # truthy value requesting a progress bar is handled the same way.
    if (
        param_name == "progress"
        and param_value
        and "buffer" not in sys.stdout.__dir__()
    ):
        # A stdout without `buffer` is taken as a sign of a notebook
        # frontend, where the progress bar is disabled.
        warnings.warn(
            "Note: it looks like you are running in Jupyter. "
            "The progress bar will be turned off."
        )
        return False

    return param_value
pysr/utils.py CHANGED
@@ -7,10 +7,12 @@ from numpy import ndarray
7
  from sklearn.utils.validation import _check_feature_names_in # type: ignore
8
 
9
  T = TypeVar("T", bound=Any)
 
10
  ArrayLike = Union[ndarray, List[T]]
 
11
 
12
 
13
- def _csv_filename_to_pkl_filename(csv_filename: Union[str, Path]) -> Union[str, Path]:
14
  if os.path.splitext(csv_filename)[1] == ".pkl":
15
  return csv_filename
16
 
 
7
  from sklearn.utils.validation import _check_feature_names_in # type: ignore
8
 
9
  T = TypeVar("T", bound=Any)
10
+
11
  ArrayLike = Union[ndarray, List[T]]
12
+ PathLike = Union[str, Path]
13
 
14
 
15
+ def _csv_filename_to_pkl_filename(csv_filename: PathLike) -> PathLike:
16
  if os.path.splitext(csv_filename)[1] == ".pkl":
17
  return csv_filename
18