Spaces:
Running
Running
from pysr import pysr, best_row | |
from sklearn.base import BaseEstimator, RegressorMixin | |
import inspect | |
import pandas as pd | |
class PySRRegressor(BaseEstimator, RegressorMixin):
    """Scikit-learn style estimator wrapping the ``pysr.pysr`` equation search.

    Keyword arguments given at construction (or through ``set_params``) are
    stored in ``self.params`` and forwarded to ``pysr`` when ``fit`` is called.
    The value returned by ``pysr`` is kept in ``self.equations``, and
    ``model_selection`` decides which row of it is used by ``get_best``,
    ``predict`` and ``__repr__``.
    """

    def __init__(self, model_selection="accuracy", **params):
        """Initialize settings for pysr.pysr call.

        :param model_selection: How to select a model. Can be 'accuracy' or 'best'. 'best' will optimize a combination of complexity and accuracy.
        :type model_selection: str
        """
        super().__init__()
        self.model_selection = model_selection
        self.params = params

        # Stored equations (None until fit() has been called):
        self.equations = None

    def __repr__(self):
        """Return a printable table of the stored equations.

        The row picked by ``model_selection`` is flagged with ``>>>>`` in the
        ``pick`` column. Before ``fit`` has been called a short placeholder
        string is returned instead.

        :raises NotImplementedError: if ``model_selection`` is neither
            'accuracy' nor 'best'.
        """
        if self.equations is None:
            return "PySRRegressor.equations=None"

        equations = self.equations
        if self.model_selection == "accuracy":
            # The last row is the one get_best() returns for 'accuracy'.
            chosen_row = -1
        elif self.model_selection == "best":
            # ``selected`` below is a plain list, so a position is needed, not
            # an index label; dropping the index makes idxmax() positional.
            chosen_row = equations["score"].reset_index(drop=True).idxmax()
        else:
            raise NotImplementedError

        selected = [""] * len(equations)
        selected[chosen_row] = ">>>>"

        repr_equations = pd.DataFrame(
            {
                "pick": selected,
                "score": equations["score"],
                "Equation": equations["Equation"],
                "MSE": equations["MSE"],
                "Complexity": equations["Complexity"],
            }
        )
        return f"PySRRegressor.equations = [\n{repr_equations!r}\n]"

    def set_params(self, **params):
        """Set parameters for pysr.pysr call or model_selection strategy.

        :param params: ``model_selection`` updates the selection strategy;
            every other key is stored for the next ``pysr`` call.
        :returns: this estimator, to allow chaining.
        """
        for key, value in params.items():
            if key == "model_selection":
                # Not a pysr() argument: keep it out of self.params, which
                # fit() forwards verbatim to pysr().
                self.model_selection = value
            else:
                self.params[key] = value
        return self

    def get_params(self, deep=True):
        """Return the stored pysr settings together with ``model_selection``.

        :param deep: accepted for scikit-learn API compatibility; ignored.
        :returns: a new dict of parameter names to values.
        """
        del deep
        return {**self.params, "model_selection": self.model_selection}

    def get_best(self):
        """Return the equation row chosen by ``model_selection``.

        :returns: ``0.0`` if no equations are stored yet; the last row of
            ``self.equations`` for 'accuracy'; the result of
            ``pysr.best_row`` for 'best'.
        :raises NotImplementedError: if ``model_selection`` is neither
            'accuracy' nor 'best'.
        """
        if self.equations is None:
            return 0.0
        if self.model_selection == "accuracy":
            return self.equations.iloc[-1]
        elif self.model_selection == "best":
            return best_row(self.equations)
        else:
            raise NotImplementedError

    def fit(self, X, y, weights=None, variable_names=None):
        """Search for equations to fit the dataset.

        :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
        :type X: np.ndarray/pandas.DataFrame
        :param y: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs). Putting in a 2D array will trigger a search for equations for each feature of y.
        :type y: np.ndarray
        :param weights: Optional. Same shape as y. Each element is how to weight the mean-square-error loss for that particular element of y.
        :type weights: np.ndarray
        :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
        :type variable_names: list
        :returns: this estimator, with the search result stored in ``self.equations``.
        """
        # An explicit argument wins; otherwise fall back to a value given at
        # construction time or through set_params().
        if variable_names is None:
            if "variable_names" in self.params:
                variable_names = self.params["variable_names"]

        self.equations = pysr(
            X=X,
            y=y,
            weights=weights,
            variable_names=variable_names,
            # variable_names is passed explicitly above; passing it again
            # through ** would be a duplicate keyword argument.
            **{k: v for k, v in self.params.items() if k != "variable_names"},
        )
        return self

    def predict(self, X):
        """Evaluate the selected equation on ``X``.

        :param X: input forwarded unchanged to the selected row's
            ``lambda_format`` entry (presumably a callable built by pysr
            that evaluates the equation -- confirm against pysr).
        :returns: whatever that ``lambda_format`` call returns.
        :raises sklearn.exceptions.NotFittedError: if ``fit`` has not been
            called yet.
        """
        if self.equations is None:
            # get_best() returns 0.0 in this state, which would otherwise
            # fail below with an unhelpful "'float' is not subscriptable".
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This PySRRegressor instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )
        equation_row = self.get_best()
        np_format = equation_row["lambda_format"]
        return np_format(X)
# Add the docs from pysr() to PySRRegressor():
# The lines of pysr()'s docstring from ":param binary_operators:" up to (but
# not including) ":returns:" are appended to PySRRegressor.__init__'s docstring.
_pysr_docstring_split = []
_start_recording = False
# inspect.getdoc() returns None when pysr has no docstring (for example when
# docstrings are stripped with -OO); fall back to an empty string so importing
# this module does not fail.
for line in (inspect.getdoc(pysr) or "").split("\n"):
    # Skip docs on "X" and "y"
    if ":param binary_operators:" in line:
        _start_recording = True
    if ":returns:" in line:
        _start_recording = False
    if _start_recording:
        _pysr_docstring_split.append(line)
_pysr_docstring = "\n\t".join(_pysr_docstring_split)
# __init__.__doc__ is likewise None when docstrings are stripped, so avoid
# an in-place "+=" on a possible None.
PySRRegressor.__init__.__doc__ = (
    PySRRegressor.__init__.__doc__ or ""
) + _pysr_docstring