MilesCranmer committed on
Commit
ad84a1c
1 Parent(s): 69aa240

Add more helpful warnings

Browse files
Files changed (2) hide show
  1. pysr/sr.py +15 -1
  2. test/test.py +28 -2
pysr/sr.py CHANGED
@@ -691,7 +691,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
691
 
692
  if maxsize > 40:
693
  warnings.warn(
694
- "Note: Using a large maxsize for the equation search will be exponentially slower and use significant memory. You should consider turning `use_frequency` to False, and perhaps use `warmup_maxsize_by`."
695
  )
696
  elif maxsize < 7:
697
  raise NotImplementedError("PySR requires a maxsize of at least 7")
@@ -1147,6 +1147,20 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
1147
  "Note: you are running with more than 10,000 datapoints. You should consider turning on batching (https://astroautomata.com/PySR/#/options?id=batching). You should also reconsider if you need that many datapoints. Unless you have a large amount of noise (in which case you should smooth your dataset first), generally < 10,000 datapoints is enough to find a functional form with symbolic regression. More datapoints will lower the search speed."
1148
  )
1149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1150
  X, selection = _handle_feature_selection(
1151
  X, select_k_features, y, variable_names
1152
  )
 
691
 
692
  if maxsize > 40:
693
  warnings.warn(
694
+ "Note: Using a large maxsize for the equation search will be exponentially slower and use significant memory."
695
  )
696
  elif maxsize < 7:
697
  raise NotImplementedError("PySR requires a maxsize of at least 7")
 
1147
  "Note: you are running with more than 10,000 datapoints. You should consider turning on batching (https://astroautomata.com/PySR/#/options?id=batching). You should also reconsider if you need that many datapoints. Unless you have a large amount of noise (in which case you should smooth your dataset first), generally < 10,000 datapoints is enough to find a functional form with symbolic regression. More datapoints will lower the search speed."
1148
  )
1149
 
1150
+ if self.n_features >= 10 and not select_k_features:
1151
+ warnings.warn(
1152
+ "Note: you are running with 10 features or more. "
1153
+ "Genetic algorithms like used in PySR scale poorly with large numbers of features. "
1154
+ "Consider using feature selection techniques to select the most important features "
1155
+ "(you can do this automatically with the `select_k_features` parameter), "
1156
+ "or, alternatively, doing a dimensionality reduction beforehand. "
1157
+ "For example, `X = PCA(n_components=6).fit_transform(X)`, "
1158
+ "using scikit-learn's `PCA` class, "
1159
+ "will reduce the number of features to 6 in an interpretable way, "
1160
+ "as each resultant feature "
1161
+ "will be a linear combination of the original features. "
1162
+ )
1163
+
1164
  X, selection = _handle_feature_selection(
1165
  X, select_k_features, y, variable_names
1166
  )
test/test.py CHANGED
@@ -7,6 +7,7 @@ from pysr.sr import run_feature_selection, _handle_feature_selection
7
  import sympy
8
  from sympy import lambdify
9
  import pandas as pd
 
10
 
11
 
12
  class TestPipeline(unittest.TestCase):
@@ -275,11 +276,36 @@ class TestMiscellaneous(unittest.TestCase):
275
  """Test miscellaneous functions."""
276
 
277
  def test_deprecation(self):
278
- # Ensure that deprecation works as expected, with a warning,
279
- # and sets the correct value.
 
 
280
  with self.assertWarns(UserWarning):
281
  model = PySRRegressor(fractionReplaced=0.2)
282
  # This is a deprecated parameter, so we should get a warning.
283
 
284
  # The correct value should be set:
285
  self.assertEqual(model.params["fraction_replaced"], 0.2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import sympy
8
  from sympy import lambdify
9
  import pandas as pd
10
+ import warnings
11
 
12
 
13
  class TestPipeline(unittest.TestCase):
 
276
  """Test miscellaneous functions."""
277
 
278
  def test_deprecation(self):
279
+ """Ensure that deprecation works as expected.
280
+
281
+ This should give a warning, and sets the correct value.
282
+ """
283
  with self.assertWarns(UserWarning):
284
  model = PySRRegressor(fractionReplaced=0.2)
285
  # This is a deprecated parameter, so we should get a warning.
286
 
287
  # The correct value should be set:
288
  self.assertEqual(model.params["fraction_replaced"], 0.2)
289
+
290
+ def test_size_warning(self):
291
+ """Ensure that a warning is given for a large input size."""
292
+ model = PySRRegressor()
293
+ X = np.random.randn(10001, 2)
294
+ y = np.random.randn(10001)
295
+ with warnings.catch_warnings():
296
+ warnings.simplefilter("error")
297
+ with self.assertRaises(Exception) as context:
298
+ model.fit(X, y)
299
+ self.assertIn("more than 10,000", str(context.exception))
300
+
301
+ def test_feature_warning(self):
302
+ """Ensure that a warning is given for large number of features."""
303
+ model = PySRRegressor()
304
+ X = np.random.randn(100, 10)
305
+ y = np.random.randn(100)
306
+ with warnings.catch_warnings():
307
+ warnings.simplefilter("error")
308
+ with self.assertRaises(Exception) as context:
309
+ model.fit(X, y)
310
+ self.assertIn("with 10 features or more", str(context.exception))
311
+