Spaces:
Running
Running
MilesCranmer
committed on
Commit
•
fd4c500
1
Parent(s):
b958ebf
fix: variety of typing information
Browse files
- pysr/denoising.py +17 -4
- pysr/feature_selection.py +19 -3
- pysr/julia_helpers.py +4 -0
- pysr/julia_import.py +3 -2
- pysr/sr.py +41 -26
pysr/denoising.py
CHANGED
@@ -1,9 +1,17 @@
|
|
1 |
"""Functions for denoising data during preprocessing."""
|
2 |
|
|
|
|
|
3 |
import numpy as np
|
|
|
4 |
|
5 |
|
6 |
-
def denoise(
|
|
|
|
|
|
|
|
|
|
|
7 |
"""Denoise the dataset using a Gaussian process."""
|
8 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
9 |
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
|
@@ -15,12 +23,17 @@ def denoise(X, y, Xresampled=None, random_state=None):
|
|
15 |
gpr.fit(X, y)
|
16 |
|
17 |
if Xresampled is not None:
|
18 |
-
return Xresampled, gpr.predict(Xresampled)
|
19 |
|
20 |
-
return X, gpr.predict(X)
|
21 |
|
22 |
|
23 |
-
def multi_denoise(
|
|
|
|
|
|
|
|
|
|
|
24 |
"""Perform `denoise` along each column of `y` independently."""
|
25 |
y = np.stack(
|
26 |
[
|
|
|
1 |
"""Functions for denoising data during preprocessing."""
|
2 |
|
3 |
+
from typing import Optional, Tuple, cast
|
4 |
+
|
5 |
import numpy as np
|
6 |
+
from numpy import ndarray
|
7 |
|
8 |
|
9 |
+
def denoise(
|
10 |
+
X: ndarray,
|
11 |
+
y: ndarray,
|
12 |
+
Xresampled: Optional[ndarray] = None,
|
13 |
+
random_state: Optional[np.random.RandomState] = None,
|
14 |
+
) -> Tuple[ndarray, ndarray]:
|
15 |
"""Denoise the dataset using a Gaussian process."""
|
16 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
17 |
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
|
|
|
23 |
gpr.fit(X, y)
|
24 |
|
25 |
if Xresampled is not None:
|
26 |
+
return Xresampled, cast(ndarray, gpr.predict(Xresampled))
|
27 |
|
28 |
+
return X, cast(ndarray, gpr.predict(X))
|
29 |
|
30 |
|
31 |
+
def multi_denoise(
|
32 |
+
X: ndarray,
|
33 |
+
y: ndarray,
|
34 |
+
Xresampled: Optional[ndarray] = None,
|
35 |
+
random_state: Optional[np.random.RandomState] = None,
|
36 |
+
):
|
37 |
"""Perform `denoise` along each column of `y` independently."""
|
38 |
y = np.stack(
|
39 |
[
|
pysr/feature_selection.py
CHANGED
@@ -1,9 +1,20 @@
|
|
1 |
"""Functions for doing feature selection during preprocessing."""
|
2 |
|
|
|
|
|
3 |
import numpy as np
|
|
|
|
|
|
|
|
|
4 |
|
5 |
|
6 |
-
def run_feature_selection(
|
|
|
|
|
|
|
|
|
|
|
7 |
"""
|
8 |
Find most important features.
|
9 |
|
@@ -21,11 +32,16 @@ def run_feature_selection(X, y, select_k_features, random_state=None):
|
|
21 |
selector = SelectFromModel(
|
22 |
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|
23 |
)
|
24 |
-
return selector.get_support(indices=True)
|
25 |
|
26 |
|
27 |
# Function has not been removed only due to usage in module tests
|
28 |
-
def _handle_feature_selection(
|
|
|
|
|
|
|
|
|
|
|
29 |
if select_k_features is not None:
|
30 |
selection = run_feature_selection(X, y, select_k_features)
|
31 |
print(f"Using features {[variable_names[i] for i in selection]}")
|
|
|
1 |
"""Functions for doing feature selection during preprocessing."""
|
2 |
|
3 |
+
from typing import Optional, cast
|
4 |
+
|
5 |
import numpy as np
|
6 |
+
from numpy import ndarray
|
7 |
+
from numpy.typing import NDArray
|
8 |
+
|
9 |
+
from .utils import ArrayLike
|
10 |
|
11 |
|
12 |
+
def run_feature_selection(
|
13 |
+
X: ndarray,
|
14 |
+
y: ndarray,
|
15 |
+
select_k_features: int,
|
16 |
+
random_state: Optional[np.random.RandomState] = None,
|
17 |
+
) -> NDArray[np.intp]:
|
18 |
"""
|
19 |
Find most important features.
|
20 |
|
|
|
32 |
selector = SelectFromModel(
|
33 |
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|
34 |
)
|
35 |
+
return cast(NDArray[np.intp], selector.get_support(indices=True))
|
36 |
|
37 |
|
38 |
# Function has not been removed only due to usage in module tests
|
39 |
+
def _handle_feature_selection(
|
40 |
+
X: ndarray,
|
41 |
+
select_k_features: Optional[int],
|
42 |
+
y: ndarray,
|
43 |
+
variable_names: ArrayLike[str],
|
44 |
+
):
|
45 |
if select_k_features is not None:
|
46 |
selection = run_feature_selection(X, y, select_k_features)
|
47 |
print(f"Using features {[variable_names[i] for i in selection]}")
|
pysr/julia_helpers.py
CHANGED
@@ -1,11 +1,15 @@
|
|
1 |
"""Functions for initializing the Julia environment and installing deps."""
|
2 |
|
|
|
|
|
3 |
import numpy as np
|
4 |
from juliacall import convert as jl_convert # type: ignore
|
5 |
|
6 |
from .deprecated import init_julia, install
|
7 |
from .julia_import import jl
|
8 |
|
|
|
|
|
9 |
jl.seval("using Serialization: Serialization")
|
10 |
jl.seval("using PythonCall: PythonCall")
|
11 |
|
|
|
1 |
"""Functions for initializing the Julia environment and installing deps."""
|
2 |
|
3 |
+
from typing import Any, Callable, cast
|
4 |
+
|
5 |
import numpy as np
|
6 |
from juliacall import convert as jl_convert # type: ignore
|
7 |
|
8 |
from .deprecated import init_julia, install
|
9 |
from .julia_import import jl
|
10 |
|
11 |
+
jl_convert = cast(Callable[[Any, Any], Any], jl_convert)
|
12 |
+
|
13 |
jl.seval("using Serialization: Serialization")
|
14 |
jl.seval("using PythonCall: PythonCall")
|
15 |
|
pysr/julia_import.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
import os
|
2 |
import sys
|
3 |
import warnings
|
4 |
-
from
|
|
|
5 |
|
6 |
# Check if JuliaCall is already loaded, and if so, warn the user
|
7 |
# about the relevant environment variables. If not loaded,
|
@@ -43,7 +44,7 @@ if autoload_extensions is not None:
|
|
43 |
|
44 |
from juliacall import Main as jl # type: ignore
|
45 |
|
46 |
-
jl
|
47 |
|
48 |
|
49 |
jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
|
|
|
1 |
import os
|
2 |
import sys
|
3 |
import warnings
|
4 |
+
from types import ModuleType
|
5 |
+
from typing import cast
|
6 |
|
7 |
# Check if JuliaCall is already loaded, and if so, warn the user
|
8 |
# about the relevant environment variables. If not loaded,
|
|
|
44 |
|
45 |
from juliacall import Main as jl # type: ignore
|
46 |
|
47 |
+
jl = cast(ModuleType, jl)
|
48 |
|
49 |
|
50 |
jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
|
pysr/sr.py
CHANGED
@@ -679,7 +679,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
679 |
X_units_: Optional[ArrayLike[str]]
|
680 |
y_units_: Optional[Union[str, ArrayLike[str]]]
|
681 |
nout_: int
|
682 |
-
selection_mask_: Optional[NDArray[np.
|
683 |
tempdir_: Path
|
684 |
equation_file_: Union[str, Path]
|
685 |
julia_state_stream_: Optional[NDArray[np.uint8]]
|
@@ -921,12 +921,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
921 |
cls,
|
922 |
equation_file,
|
923 |
*,
|
924 |
-
binary_operators=None,
|
925 |
-
unary_operators=None,
|
926 |
-
n_features_in=None,
|
927 |
-
feature_names_in=None,
|
928 |
-
selection_mask=None,
|
929 |
-
nout=1,
|
930 |
**pysr_kwargs,
|
931 |
):
|
932 |
"""
|
@@ -949,7 +949,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
949 |
feature_names_in : list[str]
|
950 |
Names of the features passed to the model.
|
951 |
Not needed if loading from a pickle file.
|
952 |
-
selection_mask :
|
953 |
If using `select_k_features`, you must pass `model.selection_mask_` here.
|
954 |
Not needed if loading from a pickle file.
|
955 |
nout : int
|
@@ -1021,7 +1021,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1021 |
model.display_feature_names_in_ = feature_names_in
|
1022 |
|
1023 |
if selection_mask is None:
|
1024 |
-
model.selection_mask_ = np.
|
1025 |
else:
|
1026 |
model.selection_mask_ = selection_mask
|
1027 |
|
@@ -1197,19 +1197,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1197 |
), "With multiple output features, index must be a list."
|
1198 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1199 |
elif isinstance(self.equations_, pd.DataFrame):
|
1200 |
-
return self.equations_.iloc[index]
|
1201 |
else:
|
1202 |
raise ValueError("No equations have been generated yet.")
|
1203 |
|
1204 |
if isinstance(self.equations_, list):
|
1205 |
return [
|
1206 |
-
eq.loc[idx_model_selection(eq, self.model_selection)]
|
1207 |
for eq in self.equations_
|
1208 |
]
|
1209 |
elif isinstance(self.equations_, pd.DataFrame):
|
1210 |
-
return
|
1211 |
-
|
1212 |
-
|
|
|
|
|
|
|
1213 |
else:
|
1214 |
raise ValueError("No equations have been generated yet.")
|
1215 |
|
@@ -1351,7 +1354,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1351 |
ndarray,
|
1352 |
Optional[ndarray],
|
1353 |
Optional[ndarray],
|
1354 |
-
|
1355 |
Optional[ArrayLike[str]],
|
1356 |
Optional[Union[str, ArrayLike[str]]],
|
1357 |
]:
|
@@ -1459,13 +1462,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1459 |
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
1460 |
|
1461 |
def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
|
1462 |
-
|
|
|
1463 |
|
1464 |
def _validate_data_X(self, X) -> Tuple[ndarray]:
|
1465 |
-
|
|
|
1466 |
|
1467 |
def _pre_transform_training_data(
|
1468 |
-
self,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1469 |
):
|
1470 |
"""
|
1471 |
Transform the training data before fitting the symbolic regressor.
|
@@ -1474,12 +1486,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1474 |
|
1475 |
Parameters
|
1476 |
----------
|
1477 |
-
X : ndarray
|
1478 |
Training data of shape (n_samples, n_features).
|
1479 |
-
y : ndarray
|
1480 |
Target values of shape (n_samples,) or (n_samples, n_targets).
|
1481 |
Will be cast to X's dtype if necessary.
|
1482 |
-
Xresampled : ndarray |
|
1483 |
Resampled training data, of shape `(n_resampled, n_features)`,
|
1484 |
used for denoising.
|
1485 |
variable_names : list[str]
|
@@ -1517,24 +1529,27 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1517 |
"""
|
1518 |
# Feature selection transformation
|
1519 |
if self.select_k_features:
|
1520 |
-
|
1521 |
X, y, self.select_k_features, random_state=random_state
|
1522 |
)
|
1523 |
-
X = X[:,
|
1524 |
|
1525 |
if Xresampled is not None:
|
1526 |
-
Xresampled = Xresampled[:,
|
1527 |
|
1528 |
# Reduce variable_names to selection
|
1529 |
-
variable_names =
|
|
|
|
|
1530 |
|
1531 |
if X_units is not None:
|
1532 |
-
X_units = [X_units[i] for i in
|
1533 |
self.X_units_ = copy.deepcopy(X_units)
|
1534 |
|
1535 |
# Re-perform data validation and feature name updating
|
1536 |
X, y = self._validate_data_X_y(X, y)
|
1537 |
# Update feature names with selected variable names
|
|
|
1538 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
1539 |
self.display_feature_names_in_ = self.feature_names_in_
|
1540 |
print(f"Using features {self.feature_names_in_}")
|
|
|
679 |
X_units_: Optional[ArrayLike[str]]
|
680 |
y_units_: Optional[Union[str, ArrayLike[str]]]
|
681 |
nout_: int
|
682 |
+
selection_mask_: Optional[NDArray[np.intp]]
|
683 |
tempdir_: Path
|
684 |
equation_file_: Union[str, Path]
|
685 |
julia_state_stream_: Optional[NDArray[np.uint8]]
|
|
|
921 |
cls,
|
922 |
equation_file,
|
923 |
*,
|
924 |
+
binary_operators: Optional[List[str]] = None,
|
925 |
+
unary_operators: Optional[List[str]] = None,
|
926 |
+
n_features_in: Optional[int] = None,
|
927 |
+
feature_names_in: Optional[ArrayLike[str]] = None,
|
928 |
+
selection_mask: Optional[NDArray[np.intp]] = None,
|
929 |
+
nout: int = 1,
|
930 |
**pysr_kwargs,
|
931 |
):
|
932 |
"""
|
|
|
949 |
feature_names_in : list[str]
|
950 |
Names of the features passed to the model.
|
951 |
Not needed if loading from a pickle file.
|
952 |
+
selection_mask : NDArray[np.intp]
|
953 |
If using `select_k_features`, you must pass `model.selection_mask_` here.
|
954 |
Not needed if loading from a pickle file.
|
955 |
nout : int
|
|
|
1021 |
model.display_feature_names_in_ = feature_names_in
|
1022 |
|
1023 |
if selection_mask is None:
|
1024 |
+
model.selection_mask_ = np.arange(n_features_in, dtype=np.intp)
|
1025 |
else:
|
1026 |
model.selection_mask_ = selection_mask
|
1027 |
|
|
|
1197 |
), "With multiple output features, index must be a list."
|
1198 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1199 |
elif isinstance(self.equations_, pd.DataFrame):
|
1200 |
+
return cast(pd.Series, self.equations_.iloc[index])
|
1201 |
else:
|
1202 |
raise ValueError("No equations have been generated yet.")
|
1203 |
|
1204 |
if isinstance(self.equations_, list):
|
1205 |
return [
|
1206 |
+
cast(pd.Series, eq.loc[idx_model_selection(eq, self.model_selection)])
|
1207 |
for eq in self.equations_
|
1208 |
]
|
1209 |
elif isinstance(self.equations_, pd.DataFrame):
|
1210 |
+
return cast(
|
1211 |
+
pd.Series,
|
1212 |
+
self.equations_.loc[
|
1213 |
+
idx_model_selection(self.equations_, self.model_selection)
|
1214 |
+
],
|
1215 |
+
)
|
1216 |
else:
|
1217 |
raise ValueError("No equations have been generated yet.")
|
1218 |
|
|
|
1354 |
ndarray,
|
1355 |
Optional[ndarray],
|
1356 |
Optional[ndarray],
|
1357 |
+
ArrayLike[str],
|
1358 |
Optional[ArrayLike[str]],
|
1359 |
Optional[Union[str, ArrayLike[str]]],
|
1360 |
]:
|
|
|
1462 |
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
1463 |
|
1464 |
def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
|
1465 |
+
raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore
|
1466 |
+
return cast(Tuple[ndarray, ndarray], raw_out)
|
1467 |
|
1468 |
def _validate_data_X(self, X) -> Tuple[ndarray]:
|
1469 |
+
raw_out = self._validate_data(X=X, reset=False) # type: ignore
|
1470 |
+
return cast(Tuple[ndarray], raw_out)
|
1471 |
|
1472 |
def _pre_transform_training_data(
|
1473 |
+
self,
|
1474 |
+
X: ndarray,
|
1475 |
+
y: ndarray,
|
1476 |
+
Xresampled: Union[ndarray, None],
|
1477 |
+
variable_names: ArrayLike[str],
|
1478 |
+
X_units: Union[ArrayLike[str], None],
|
1479 |
+
y_units: Union[ArrayLike[str], str, None],
|
1480 |
+
random_state: np.random.RandomState,
|
1481 |
):
|
1482 |
"""
|
1483 |
Transform the training data before fitting the symbolic regressor.
|
|
|
1486 |
|
1487 |
Parameters
|
1488 |
----------
|
1489 |
+
X : ndarray
|
1490 |
Training data of shape (n_samples, n_features).
|
1491 |
+
y : ndarray
|
1492 |
Target values of shape (n_samples,) or (n_samples, n_targets).
|
1493 |
Will be cast to X's dtype if necessary.
|
1494 |
+
Xresampled : ndarray | None
|
1495 |
Resampled training data, of shape `(n_resampled, n_features)`,
|
1496 |
used for denoising.
|
1497 |
variable_names : list[str]
|
|
|
1529 |
"""
|
1530 |
# Feature selection transformation
|
1531 |
if self.select_k_features:
|
1532 |
+
selection_mask = run_feature_selection(
|
1533 |
X, y, self.select_k_features, random_state=random_state
|
1534 |
)
|
1535 |
+
X = X[:, selection_mask]
|
1536 |
|
1537 |
if Xresampled is not None:
|
1538 |
+
Xresampled = Xresampled[:, selection_mask]
|
1539 |
|
1540 |
# Reduce variable_names to selection
|
1541 |
+
variable_names = cast(
|
1542 |
+
ArrayLike[str], [variable_names[i] for i in selection_mask]
|
1543 |
+
)
|
1544 |
|
1545 |
if X_units is not None:
|
1546 |
+
X_units = cast(ArrayLike[str], [X_units[i] for i in selection_mask])
|
1547 |
self.X_units_ = copy.deepcopy(X_units)
|
1548 |
|
1549 |
# Re-perform data validation and feature name updating
|
1550 |
X, y = self._validate_data_X_y(X, y)
|
1551 |
# Update feature names with selected variable names
|
1552 |
+
self.selection_mask_ = selection_mask
|
1553 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
1554 |
self.display_feature_names_in_ = self.feature_names_in_
|
1555 |
print(f"Using features {self.feature_names_in_}")
|