MilesCranmer committed
Commit 5750d1a
1 Parent(s): fdb138f

Add denoising operation with test

Files changed (2):
  1. pysr/sr.py +37 -0
  2. test/test.py +17 -0
pysr/sr.py CHANGED
@@ -130,6 +130,8 @@ def pysr(
    optimizer_iterations=10,
    tournament_selection_n=10,
    tournament_selection_p=1.0,
+    denoise=False,
+    Xresampled=None,
):
    """Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
    Note: most default parameters have been tuned over several example
@@ -244,6 +246,8 @@ def pysr(
    :type tournament_selection_n: int
    :param tournament_selection_p: Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss.
    :type tournament_selection_p: float
+    :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data.
+    :type denoise: bool
    :returns: Results dataframe, giving complexity, MSE, and equations (as strings), as well as functional forms. If list, each element corresponds to a dataframe of equations for each output.
    :type: pd.DataFrame/list
    """
@@ -327,6 +331,24 @@ def pysr(
    else:
        raise NotImplementedError("y shape not supported!")

+    if denoise:
+        if weights is not None:
+            raise NotImplementedError(
+                "No weights for denoising - the weights are learned."
+            )
+        if Xresampled is not None and selection is not None:
+            # Select among only the selected features:
+            Xresampled = Xresampled[:, selection]
+        if multioutput:
+            y = np.stack(
+                [_denoise(X, y[:, i], Xresampled=Xresampled)[1] for i in range(nout)],
+                axis=1,
+            )
+            if Xresampled is not None:
+                X = Xresampled
+        else:
+            X, y = _denoise(X, y, Xresampled=Xresampled)
+
    kwargs = dict(
        X=X,
        y=y,
@@ -387,6 +409,7 @@ def pysr(
        nout=nout,
        tournament_selection_n=tournament_selection_n,
        tournament_selection_p=tournament_selection_p,
+        denoise=denoise,
    )

    kwargs = {**_set_paths(tempdir), **kwargs}
@@ -1082,6 +1105,20 @@ def _yesno(question):
    return False


+def _denoise(X, y, Xresampled=None):
+    """Denoise the dataset using a Gaussian process"""
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
+
+    gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
+    gpr = GaussianProcessRegressor(kernel=gp_kernel, n_restarts_optimizer=50)
+    gpr.fit(X, y)
+    if Xresampled is not None:
+        return Xresampled, gpr.predict(Xresampled)
+
+    return X, gpr.predict(X)
+
+
class CallableEquation(object):
    """Simple wrapper for numpy lambda functions built with sympy"""
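For readers skimming the diff: `_denoise` fits a Gaussian process to `(X, y)` and replaces the targets with the GP's posterior mean, optionally evaluated at `Xresampled` instead of the original inputs. A minimal standalone sketch of that step, outside of PySR and with purely illustrative data, could look like:

    # Standalone sketch of the denoising step; data and sizes here are illustrative, not from PySR.
    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel

    rng = np.random.RandomState(0)
    X = rng.uniform(-3, 3, size=(100, 2))        # inputs
    y = X[:, 0] ** 2 + 0.05 * rng.randn(100)     # noisy targets

    # Same kernel family as _denoise: per-feature RBF + learned white-noise level + constant offset.
    kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
    gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=50)
    gpr.fit(X, y)

    # The GP's posterior mean stands in for the noisy targets in the downstream search.
    y_denoised = gpr.predict(X)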
test/test.py CHANGED
@@ -82,6 +82,23 @@ class TestPipeline(unittest.TestCase):

        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)

+    def test_noisy(self):
+
+        np.random.seed(1)
+        y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0]) * 0.05
+        equations = pysr(
+            self.X,
+            y,
+            unary_operators=["sq(x) = x^2"],
+            binary_operators=["plus"],
+            extra_sympy_mappings={"sq": lambda x: x ** 2},
+            **self.default_test_kwargs,
+            procs=0,
+            denoise=True,
+        )
+        self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-4)
+        self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-4)
+

class TestBest(unittest.TestCase):
    def setUp(self):
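From a user's perspective, the new test doubles as a usage example: pass `denoise=True` to `pysr` to smooth the targets with a Gaussian process before the search, and optionally pass `Xresampled` to evaluate the fitted GP on a different set of inputs. A rough sketch of both call patterns (single-output data, illustrative sizes, operator definitions mirroring the test above):

    # Illustrative call sites for the new denoise / Xresampled keywords (not part of the test suite).
    import numpy as np
    from pysr import pysr

    np.random.seed(1)
    X = np.random.randn(100, 2)
    y = X[:, 0] ** 2 + np.random.randn(100) * 0.05  # noisy single-output target

    # Smooth y with a Gaussian process before the symbolic regression search.
    equations = pysr(
        X,
        y,
        unary_operators=["sq(x) = x^2"],
        binary_operators=["plus"],
        extra_sympy_mappings={"sq": lambda x: x ** 2},
        denoise=True,
    )

    # Optionally train the GP on (X, y) but hand the search its predictions on a fresh grid:
    X_grid = np.random.uniform(-3, 3, size=(200, 2))
    equations = pysr(
        X,
        y,
        unary_operators=["sq(x) = x^2"],
        binary_operators=["plus"],
        extra_sympy_mappings={"sq": lambda x: x ** 2},
        denoise=True,
        Xresampled=X_grid,
    )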