Spaces:
Running
Running
MilesCranmer
committed on
Merge pull request #389 from MilesCranmer/backend-update-0.21.2
Browse files- docs/examples.md +89 -1
- docs/gen_param_docs.py +1 -1
- pysr/julia_helpers.py +2 -2
- {docs → pysr}/param_groupings.yml +3 -0
- pysr/sr.py +176 -36
- pysr/test/test.py +167 -0
- pysr/version.py +2 -2
- requirements.txt +1 -1
docs/examples.md
CHANGED
@@ -433,9 +433,97 @@ equal to:
|
|
433 |
$\frac{x_0^2 x_1 - 2.0000073}{x_2^2 - 1.0000019}$, which
|
434 |
is nearly the same as the true equation!
|
435 |
|
|
|
436 |
|
|
|
|
|
|
|
437 |
|
438 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
439 |
|
440 |
For the many other features available in PySR, please
|
441 |
read the [Options section](options.md).
|
|
|
433 |
$\frac{x_0^2 x_1 - 2.0000073}{x_2^2 - 1.0000019}$, which
|
434 |
is nearly the same as the true equation!
|
435 |
|
436 |
+
## 10. Dimensional constraints
|
437 |
|
438 |
+
One other feature we can exploit is dimensional analysis.
|
439 |
+
Say that we know the physical units of each feature and output,
|
440 |
+
and we want to find an expression that is dimensionally consistent.
|
441 |
|
442 |
+
We can do this as follows, using `DynamicQuantities.jl` to assign units,
|
443 |
+
passing a string specifying the units for each variable.
|
444 |
+
First, let's make some data on Newton's law of gravitation, using
|
445 |
+
astropy for units:
|
446 |
+
|
447 |
+
```python
|
448 |
+
import numpy as np
|
449 |
+
from astropy import units as u, constants as const
|
450 |
+
|
451 |
+
M = (np.random.rand(100) + 0.1) * const.M_sun
|
452 |
+
m = 100 * (np.random.rand(100) + 0.1) * u.kg
|
453 |
+
r = (np.random.rand(100) + 0.1) * const.R_earth
|
454 |
+
G = const.G
|
455 |
+
|
456 |
+
F = G * M * m / r**2
|
457 |
+
```
|
458 |
+
|
459 |
+
We can see the units of `F` with `F.unit`.
|
460 |
+
|
461 |
+
Now, let's create our model.
|
462 |
+
Since this data has such a large dynamic range,
|
463 |
+
let's also create a custom loss function
|
464 |
+
that looks at the error in log-space:
|
465 |
+
|
466 |
+
```python
|
467 |
+
loss = """function loss_fnc(prediction, target)
|
468 |
+
scatter_loss = abs(log((abs(prediction)+1e-20) / (abs(target)+1e-20)))
|
469 |
+
sign_loss = 10 * (sign(prediction) - sign(target))^2
|
470 |
+
return scatter_loss + sign_loss
|
471 |
+
end
|
472 |
+
"""
|
473 |
+
```
|
474 |
+
|
475 |
+
Now let's define our model:
|
476 |
+
|
477 |
+
```python
|
478 |
+
model = PySRRegressor(
|
479 |
+
binary_operators=["+", "-", "*", "/"],
|
480 |
+
unary_operators=["square"],
|
481 |
+
loss=loss,
|
482 |
+
complexity_of_constants=2,
|
483 |
+
maxsize=25,
|
484 |
+
niterations=100,
|
485 |
+
populations=50,
|
486 |
+
# Amount to penalize dimensional violations:
|
487 |
+
dimensional_constraint_penalty=10**5,
|
488 |
+
)
|
489 |
+
```
|
490 |
+
|
491 |
+
and fit it, passing the unit information.
|
492 |
+
To do this, we need to use the format of [DynamicQuantities.jl](https://symbolicml.org/DynamicQuantities.jl/dev/#Usage).
|
493 |
+
|
494 |
+
```python
|
495 |
+
# Get numerical arrays to fit:
|
496 |
+
X = pd.DataFrame(dict(
|
497 |
+
M=M.value,
|
498 |
+
m=m.value,
|
499 |
+
r=r.value,
|
500 |
+
))
|
501 |
+
y = F.value
|
502 |
+
|
503 |
+
model.fit(
|
504 |
+
X,
|
505 |
+
y,
|
506 |
+
X_units=["Constants.M_sun", "kg", "Constants.R_earth"],
|
507 |
+
y_units="kg * m / s^2"
|
508 |
+
)
|
509 |
+
```
|
510 |
+
|
511 |
+
You can observe that all expressions with a loss under
|
512 |
+
our penalty are dimensionally consistent!
|
513 |
+
(The `"[⋅]"` indicates free units in a constant, which can cancel out other units in the expression.)
|
514 |
+
For example,
|
515 |
+
|
516 |
+
```julia
|
517 |
+
"y[m s⁻² kg] = (M[kg] * 2.6353e-22[⋅])"
|
518 |
+
```
|
519 |
+
|
520 |
+
would indicate that the expression is dimensionally consistent, with
|
521 |
+
a constant `"2.6353e-22[m s⁻²]"`.
|
522 |
+
|
523 |
+
Note that this expression has a large dynamic range, so it may be difficult to find. Consider searching with a larger `niterations` if needed.
|
524 |
+
|
525 |
+
|
526 |
+
## 11. Additional features
|
527 |
|
528 |
For the many other features available in PySR, please
|
529 |
read the [Options section](options.md).
|
docs/gen_param_docs.py
CHANGED
@@ -53,7 +53,7 @@ def str_param_groups(param_groupings, params, cur_heading=2):
|
|
53 |
if __name__ == "__main__":
|
54 |
# This is the path to the param_groupings.yml file
|
55 |
# relative to the current file.
|
56 |
-
path = "param_groupings.yml"
|
57 |
with open(path, "r") as f:
|
58 |
param_groupings = safe_load(f)
|
59 |
|
|
|
53 |
if __name__ == "__main__":
|
54 |
# This is the path to the param_groupings.yml file
|
55 |
# relative to the current file.
|
56 |
+
path = "../pysr/param_groupings.yml"
|
57 |
with open(path, "r") as f:
|
58 |
param_groupings = safe_load(f)
|
59 |
|
pysr/julia_helpers.py
CHANGED
@@ -259,6 +259,7 @@ def init_julia(julia_project=None, quiet=False, julia_kwargs=None, return_aux=Fa
|
|
259 |
|
260 |
def _add_sr_to_julia_project(Main, io_arg):
|
261 |
Main.eval("using Pkg")
|
|
|
262 |
Main.sr_spec = Main.PackageSpec(
|
263 |
name="SymbolicRegression",
|
264 |
url="https://github.com/MilesCranmer/SymbolicRegression.jl",
|
@@ -266,8 +267,7 @@ def _add_sr_to_julia_project(Main, io_arg):
|
|
266 |
)
|
267 |
Main.clustermanagers_spec = Main.PackageSpec(
|
268 |
name="ClusterManagers",
|
269 |
-
|
270 |
-
rev="14e7302f068794099344d5d93f71979aaf4fbeb3",
|
271 |
)
|
272 |
Main.eval(f"Pkg.add([sr_spec, clustermanagers_spec], {io_arg})")
|
273 |
|
|
|
259 |
|
260 |
def _add_sr_to_julia_project(Main, io_arg):
|
261 |
Main.eval("using Pkg")
|
262 |
+
Main.eval("Pkg.Registry.update()")
|
263 |
Main.sr_spec = Main.PackageSpec(
|
264 |
name="SymbolicRegression",
|
265 |
url="https://github.com/MilesCranmer/SymbolicRegression.jl",
|
|
|
267 |
)
|
268 |
Main.clustermanagers_spec = Main.PackageSpec(
|
269 |
name="ClusterManagers",
|
270 |
+
version="0.4",
|
|
|
271 |
)
|
272 |
Main.eval(f"Pkg.add([sr_spec, clustermanagers_spec], {io_arg})")
|
273 |
|
{docs → pysr}/param_groupings.yml
RENAMED
@@ -13,6 +13,7 @@
|
|
13 |
- loss
|
14 |
- full_objective
|
15 |
- model_selection
|
|
|
16 |
- Working with Complexities:
|
17 |
- parsimony
|
18 |
- constraints
|
@@ -72,12 +73,14 @@
|
|
72 |
- fast_cycle
|
73 |
- turbo
|
74 |
- enable_autodiff
|
|
|
75 |
- random_state
|
76 |
- deterministic
|
77 |
- warm_start
|
78 |
- Monitoring:
|
79 |
- verbosity
|
80 |
- update_verbosity
|
|
|
81 |
- progress
|
82 |
- Environment:
|
83 |
- temp_equation_file
|
|
|
13 |
- loss
|
14 |
- full_objective
|
15 |
- model_selection
|
16 |
+
- dimensional_constraint_penalty
|
17 |
- Working with Complexities:
|
18 |
- parsimony
|
19 |
- constraints
|
|
|
73 |
- fast_cycle
|
74 |
- turbo
|
75 |
- enable_autodiff
|
76 |
+
- Determinism:
|
77 |
- random_state
|
78 |
- deterministic
|
79 |
- warm_start
|
80 |
- Monitoring:
|
81 |
- verbosity
|
82 |
- update_verbosity
|
83 |
+
- print_precision
|
84 |
- progress
|
85 |
- Environment:
|
86 |
- temp_equation_file
|
pysr/sr.py
CHANGED
@@ -167,6 +167,8 @@ def _check_assertions(
|
|
167 |
variable_names,
|
168 |
weights,
|
169 |
y,
|
|
|
|
|
170 |
):
|
171 |
# Check for potential errors before they happen
|
172 |
assert len(X.shape) == 2
|
@@ -184,12 +186,30 @@ def _check_assertions(
|
|
184 |
f"Variable name {var_name} is already a function name."
|
185 |
)
|
186 |
# Check if alphanumeric only:
|
187 |
-
if not re.match(r"^[a-zA-Z0-9_]+$", var_name):
|
188 |
raise ValueError(
|
189 |
f"Invalid variable name {var_name}. "
|
190 |
"Only alphanumeric characters, numbers, "
|
191 |
"and underscores are allowed."
|
192 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
|
194 |
|
195 |
def best(*args, **kwargs): # pragma: no cover
|
@@ -354,6 +374,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
354 |
You may pass a function with the same arguments as this (note
|
355 |
that the name of the function doesn't matter). Here,
|
356 |
both `prediction` and `dataset.y` are 1D arrays of length `dataset.n`.
|
|
|
|
|
|
|
357 |
Default is `None`.
|
358 |
complexity_of_operators : dict[str, float]
|
359 |
If you would like to use a complexity other than 1 for an
|
@@ -371,6 +394,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
371 |
parsimony : float
|
372 |
Multiplicative factor for how much to punish complexity.
|
373 |
Default is `0.0032`.
|
|
|
|
|
|
|
374 |
use_frequency : bool
|
375 |
Whether to measure the frequency of complexities, and use that
|
376 |
instead of parsimony to explore equation space. Will naturally
|
@@ -551,6 +577,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
551 |
What verbosity level to use for package updates.
|
552 |
Will take value of `verbosity` if not given.
|
553 |
Default is `None`.
|
|
|
|
|
554 |
progress : bool
|
555 |
Whether to use a progress bar instead of printing to stdout.
|
556 |
Default is `True`.
|
@@ -633,6 +661,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
633 |
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
634 |
Names of features seen during :term:`fit`. Defined only when `X`
|
635 |
has feature names that are all strings.
|
|
|
|
|
|
|
|
|
|
|
|
|
636 |
nout_ : int
|
637 |
Number of output dimensions.
|
638 |
selection_mask_ : list[int] of length `select_k_features`
|
@@ -712,6 +746,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
712 |
complexity_of_constants=1,
|
713 |
complexity_of_variables=1,
|
714 |
parsimony=0.0032,
|
|
|
715 |
use_frequency=True,
|
716 |
use_frequency_in_tournament=True,
|
717 |
adaptive_parsimony_scaling=20.0,
|
@@ -758,6 +793,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
758 |
warm_start=False,
|
759 |
verbosity=1e9,
|
760 |
update_verbosity=None,
|
|
|
761 |
progress=True,
|
762 |
equation_file=None,
|
763 |
temp_equation_file=False,
|
@@ -802,6 +838,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
802 |
self.complexity_of_constants = complexity_of_constants
|
803 |
self.complexity_of_variables = complexity_of_variables
|
804 |
self.parsimony = parsimony
|
|
|
805 |
self.use_frequency = use_frequency
|
806 |
self.use_frequency_in_tournament = use_frequency_in_tournament
|
807 |
self.adaptive_parsimony_scaling = adaptive_parsimony_scaling
|
@@ -853,6 +890,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
853 |
# - Runtime user interface
|
854 |
self.verbosity = verbosity
|
855 |
self.update_verbosity = update_verbosity
|
|
|
856 |
self.progress = progress
|
857 |
# - Project management
|
858 |
self.equation_file = equation_file
|
@@ -976,11 +1014,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
976 |
|
977 |
# Else, we re-create it.
|
978 |
print(
|
979 |
-
f"{
|
980 |
"so we must create the model from scratch."
|
981 |
)
|
982 |
-
assert binary_operators is not None
|
983 |
-
assert unary_operators is not None
|
984 |
assert n_features_in is not None
|
985 |
|
986 |
# TODO: copy .bkup file if exists.
|
@@ -995,10 +1032,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
995 |
model.n_features_in_ = n_features_in
|
996 |
|
997 |
if feature_names_in is None:
|
998 |
-
model.feature_names_in_ = [f"x{i}" for i in range(n_features_in)]
|
|
|
|
|
|
|
999 |
else:
|
1000 |
assert len(feature_names_in) == n_features_in
|
1001 |
model.feature_names_in_ = feature_names_in
|
|
|
1002 |
|
1003 |
if selection_mask is None:
|
1004 |
model.selection_mask_ = np.ones(n_features_in, dtype=bool)
|
@@ -1318,7 +1359,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1318 |
|
1319 |
return packed_modified_params
|
1320 |
|
1321 |
-
def _validate_and_set_fit_params(
|
|
|
|
|
1322 |
"""
|
1323 |
Validate the parameters passed to the :term:`fit` method.
|
1324 |
|
@@ -1340,6 +1383,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1340 |
for that particular element of y.
|
1341 |
variable_names : list[str] of length n_features
|
1342 |
Names of each variable in the training dataset, `X`.
|
|
|
|
|
|
|
|
|
1343 |
|
1344 |
Returns
|
1345 |
-------
|
@@ -1351,6 +1398,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1351 |
Validated resampled training data used for denoising.
|
1352 |
variable_names_validated : list[str] of length n_features
|
1353 |
Validated list of variable names for each feature in `X`.
|
|
|
|
|
|
|
|
|
1354 |
|
1355 |
"""
|
1356 |
if isinstance(X, pd.DataFrame):
|
@@ -1361,7 +1412,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1361 |
"Using DataFrame column names instead."
|
1362 |
)
|
1363 |
|
1364 |
-
if
|
|
|
|
|
|
|
1365 |
X.columns = X.columns.str.replace(" ", "_")
|
1366 |
warnings.warn(
|
1367 |
"Spaces in DataFrame column names are not supported. "
|
@@ -1384,7 +1438,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1384 |
weights = check_array(weights, ensure_2d=False)
|
1385 |
check_consistent_length(weights, y)
|
1386 |
X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
|
1387 |
-
self.feature_names_in_ = _check_feature_names_in(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1388 |
variable_names = self.feature_names_in_
|
1389 |
|
1390 |
# Handle multioutput data
|
@@ -1395,10 +1460,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1395 |
else:
|
1396 |
raise NotImplementedError("y shape not supported!")
|
1397 |
|
1398 |
-
|
|
|
|
|
|
|
1399 |
|
1400 |
def _pre_transform_training_data(
|
1401 |
-
self, X, y, Xresampled, variable_names, random_state
|
1402 |
):
|
1403 |
"""
|
1404 |
Transform the training data before fitting the symbolic regressor.
|
@@ -1418,6 +1486,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1418 |
variable_names : list[str]
|
1419 |
Names of each variable in the training dataset, `X`.
|
1420 |
Of length `n_features`.
|
|
|
|
|
|
|
|
|
1421 |
random_state : int | np.RandomState
|
1422 |
Pass an int for reproducible results across multiple function calls.
|
1423 |
See :term:`Glossary <random_state>`. Default is `None`.
|
@@ -1439,6 +1511,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1439 |
variable_names_transformed : list[str] of length n_features
|
1440 |
Names of each variable in the transformed dataset,
|
1441 |
`X_transformed`.
|
|
|
|
|
|
|
|
|
1442 |
"""
|
1443 |
# Feature selection transformation
|
1444 |
if self.select_k_features:
|
@@ -1453,10 +1529,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1453 |
# Reduce variable_names to selection
|
1454 |
variable_names = [variable_names[i] for i in self.selection_mask_]
|
1455 |
|
|
|
|
|
|
|
|
|
1456 |
# Re-perform data validation and feature name updating
|
1457 |
X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
|
1458 |
# Update feature names with selected variable names
|
1459 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
|
|
1460 |
print(f"Using features {self.feature_names_in_}")
|
1461 |
|
1462 |
# Denoising transformation
|
@@ -1476,7 +1557,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1476 |
else:
|
1477 |
X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
|
1478 |
|
1479 |
-
return X, y, variable_names
|
1480 |
|
1481 |
def _run(self, X, y, mutated_params, weights, seed):
|
1482 |
"""
|
@@ -1629,6 +1710,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1629 |
tournament_selection_n=self.tournament_selection_n,
|
1630 |
# These have the same name:
|
1631 |
parsimony=self.parsimony,
|
|
|
1632 |
alpha=self.alpha,
|
1633 |
maxdepth=maxdepth,
|
1634 |
fast_cycle=self.fast_cycle,
|
@@ -1648,6 +1730,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1648 |
fraction_replaced=self.fraction_replaced,
|
1649 |
topn=self.topn,
|
1650 |
verbosity=self.verbosity,
|
|
|
1651 |
optimizer_algorithm=self.optimizer_algorithm,
|
1652 |
optimizer_nrestarts=self.optimizer_nrestarts,
|
1653 |
optimizer_probability=self.optimize_probability,
|
@@ -1699,6 +1782,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1699 |
None if parallelism in ["serial", "multithreading"] else int(self.procs)
|
1700 |
)
|
1701 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1702 |
# Call to Julia backend.
|
1703 |
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl
|
1704 |
self.raw_julia_state_ = SymbolicRegression.equation_search(
|
@@ -1706,7 +1795,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1706 |
Main.y,
|
1707 |
weights=Main.weights,
|
1708 |
niterations=int(self.niterations),
|
1709 |
-
variable_names=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1710 |
options=options,
|
1711 |
numprocs=cprocs,
|
1712 |
parallelism=parallelism,
|
@@ -1732,6 +1829,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1732 |
Xresampled=None,
|
1733 |
weights=None,
|
1734 |
variable_names=None,
|
|
|
|
|
1735 |
):
|
1736 |
"""
|
1737 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
@@ -1759,6 +1858,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1759 |
instead of `variable_names`. Cannot contain spaces or special
|
1760 |
characters. Avoid variable names which are also
|
1761 |
function names in `sympy`, such as "N".
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1762 |
|
1763 |
Returns
|
1764 |
-------
|
@@ -1780,6 +1888,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1780 |
self.nout_ = 1
|
1781 |
self.selection_mask_ = None
|
1782 |
self.raw_julia_state_ = None
|
|
|
|
|
1783 |
|
1784 |
random_state = check_random_state(self.random_state) # For np random
|
1785 |
seed = random_state.get_state()[1][0] # For julia random
|
@@ -1788,8 +1898,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1788 |
|
1789 |
mutated_params = self._validate_and_set_init_params()
|
1790 |
|
1791 |
-
|
1792 |
-
X,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1793 |
)
|
1794 |
|
1795 |
if X.shape[0] > 10000 and not self.batching:
|
@@ -1804,8 +1922,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1804 |
)
|
1805 |
|
1806 |
# Pre transformations (feature selection and denoising)
|
1807 |
-
X, y, variable_names = self._pre_transform_training_data(
|
1808 |
-
X, y, Xresampled, variable_names, random_state
|
1809 |
)
|
1810 |
|
1811 |
# Warn about large feature counts (still warn if feature count is large
|
@@ -1834,6 +1952,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1834 |
variable_names,
|
1835 |
weights,
|
1836 |
y,
|
|
|
|
|
1837 |
)
|
1838 |
|
1839 |
# Initially, just save model parameters, so that
|
@@ -2072,17 +2192,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2072 |
with open(cur_filename, "r") as f:
|
2073 |
buf = f.read()
|
2074 |
buf = _preprocess_julia_floats(buf)
|
2075 |
-
|
2076 |
-
|
2077 |
-
# Rename Complexity column to complexity:
|
2078 |
-
df.rename(
|
2079 |
-
columns={
|
2080 |
-
"Complexity": "complexity",
|
2081 |
-
"Loss": "loss",
|
2082 |
-
"Equation": "equation",
|
2083 |
-
},
|
2084 |
-
inplace=True,
|
2085 |
-
)
|
2086 |
|
2087 |
all_outputs.append(df)
|
2088 |
else:
|
@@ -2092,15 +2203,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2092 |
with open(filename, "r") as f:
|
2093 |
buf = f.read()
|
2094 |
buf = _preprocess_julia_floats(buf)
|
2095 |
-
all_outputs = [pd.read_csv(StringIO(buf))]
|
2096 |
-
all_outputs[-1].rename(
|
2097 |
-
columns={
|
2098 |
-
"Complexity": "complexity",
|
2099 |
-
"Loss": "loss",
|
2100 |
-
"Equation": "equation",
|
2101 |
-
},
|
2102 |
-
inplace=True,
|
2103 |
-
)
|
2104 |
|
2105 |
except FileNotFoundError:
|
2106 |
raise RuntimeError(
|
@@ -2109,6 +2212,35 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2109 |
)
|
2110 |
return all_outputs
|
2111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2112 |
def get_hof(self):
|
2113 |
"""Get the equations from a hall of fame file.
|
2114 |
|
@@ -2409,3 +2541,11 @@ def _preprocess_julia_floats(s: str) -> str:
|
|
2409 |
s = _apply_regexp_im_sci(s)
|
2410 |
s = _apply_regexp_sci(s)
|
2411 |
return s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
variable_names,
|
168 |
weights,
|
169 |
y,
|
170 |
+
X_units,
|
171 |
+
y_units,
|
172 |
):
|
173 |
# Check for potential errors before they happen
|
174 |
assert len(X.shape) == 2
|
|
|
186 |
f"Variable name {var_name} is already a function name."
|
187 |
)
|
188 |
# Check if alphanumeric only:
|
189 |
+
if not re.match(r"^[₀₁₂₃₄₅₆₇₈₉a-zA-Z0-9_]+$", var_name):
|
190 |
raise ValueError(
|
191 |
f"Invalid variable name {var_name}. "
|
192 |
"Only alphanumeric characters, numbers, "
|
193 |
"and underscores are allowed."
|
194 |
)
|
195 |
+
if X_units is not None and len(X_units) != X.shape[1]:
|
196 |
+
raise ValueError(
|
197 |
+
"The number of units in `X_units` must equal the number of features in `X`."
|
198 |
+
)
|
199 |
+
if y_units is not None:
|
200 |
+
good_y_units = False
|
201 |
+
if isinstance(y_units, list):
|
202 |
+
if len(y.shape) == 1:
|
203 |
+
good_y_units = len(y_units) == 1
|
204 |
+
else:
|
205 |
+
good_y_units = len(y_units) == y.shape[1]
|
206 |
+
else:
|
207 |
+
good_y_units = len(y.shape) == 1 or y.shape[1] == 1
|
208 |
+
|
209 |
+
if not good_y_units:
|
210 |
+
raise ValueError(
|
211 |
+
"The number of units in `y_units` must equal the number of output features in `y`."
|
212 |
+
)
|
213 |
|
214 |
|
215 |
def best(*args, **kwargs): # pragma: no cover
|
|
|
374 |
You may pass a function with the same arguments as this (note
|
375 |
that the name of the function doesn't matter). Here,
|
376 |
both `prediction` and `dataset.y` are 1D arrays of length `dataset.n`.
|
377 |
+
If using `batching`, then you should add an
|
378 |
+
`idx` argument to the function, which is `nothing`
|
379 |
+
for non-batched, and a 1D array of indices for batched.
|
380 |
Default is `None`.
|
381 |
complexity_of_operators : dict[str, float]
|
382 |
If you would like to use a complexity other than 1 for an
|
|
|
394 |
parsimony : float
|
395 |
Multiplicative factor for how much to punish complexity.
|
396 |
Default is `0.0032`.
|
397 |
+
dimensional_constraint_penalty : float
|
398 |
+
Additive penalty applied if dimensional analysis of an expression fails.
|
399 |
+
By default, this is `1000.0`.
|
400 |
use_frequency : bool
|
401 |
Whether to measure the frequency of complexities, and use that
|
402 |
instead of parsimony to explore equation space. Will naturally
|
|
|
577 |
What verbosity level to use for package updates.
|
578 |
Will take value of `verbosity` if not given.
|
579 |
Default is `None`.
|
580 |
+
print_precision : int
|
581 |
+
How many significant digits to print for floats. Default is `5`.
|
582 |
progress : bool
|
583 |
Whether to use a progress bar instead of printing to stdout.
|
584 |
Default is `True`.
|
|
|
661 |
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
662 |
Names of features seen during :term:`fit`. Defined only when `X`
|
663 |
has feature names that are all strings.
|
664 |
+
pretty_feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
665 |
+
Pretty names of features, used only during printing.
|
666 |
+
X_units_ : list[str] of length n_features
|
667 |
+
Units of each variable in the training dataset, `X`.
|
668 |
+
y_units_ : str | list[str] of length n_out
|
669 |
+
Units of each variable in the training dataset, `y`.
|
670 |
nout_ : int
|
671 |
Number of output dimensions.
|
672 |
selection_mask_ : list[int] of length `select_k_features`
|
|
|
746 |
complexity_of_constants=1,
|
747 |
complexity_of_variables=1,
|
748 |
parsimony=0.0032,
|
749 |
+
dimensional_constraint_penalty=None,
|
750 |
use_frequency=True,
|
751 |
use_frequency_in_tournament=True,
|
752 |
adaptive_parsimony_scaling=20.0,
|
|
|
793 |
warm_start=False,
|
794 |
verbosity=1e9,
|
795 |
update_verbosity=None,
|
796 |
+
print_precision=5,
|
797 |
progress=True,
|
798 |
equation_file=None,
|
799 |
temp_equation_file=False,
|
|
|
838 |
self.complexity_of_constants = complexity_of_constants
|
839 |
self.complexity_of_variables = complexity_of_variables
|
840 |
self.parsimony = parsimony
|
841 |
+
self.dimensional_constraint_penalty = dimensional_constraint_penalty
|
842 |
self.use_frequency = use_frequency
|
843 |
self.use_frequency_in_tournament = use_frequency_in_tournament
|
844 |
self.adaptive_parsimony_scaling = adaptive_parsimony_scaling
|
|
|
890 |
# - Runtime user interface
|
891 |
self.verbosity = verbosity
|
892 |
self.update_verbosity = update_verbosity
|
893 |
+
self.print_precision = print_precision
|
894 |
self.progress = progress
|
895 |
# - Project management
|
896 |
self.equation_file = equation_file
|
|
|
1014 |
|
1015 |
# Else, we re-create it.
|
1016 |
print(
|
1017 |
+
f"{pkl_filename} does not exist, "
|
1018 |
"so we must create the model from scratch."
|
1019 |
)
|
1020 |
+
assert binary_operators is not None or unary_operators is not None
|
|
|
1021 |
assert n_features_in is not None
|
1022 |
|
1023 |
# TODO: copy .bkup file if exists.
|
|
|
1032 |
model.n_features_in_ = n_features_in
|
1033 |
|
1034 |
if feature_names_in is None:
|
1035 |
+
model.feature_names_in_ = np.array([f"x{i}" for i in range(n_features_in)])
|
1036 |
+
model.pretty_feature_names_in_ = np.array(
|
1037 |
+
[f"x{_subscriptify(i)}" for i in range(n_features_in)]
|
1038 |
+
)
|
1039 |
else:
|
1040 |
assert len(feature_names_in) == n_features_in
|
1041 |
model.feature_names_in_ = feature_names_in
|
1042 |
+
model.pretty_feature_names_in_ = None
|
1043 |
|
1044 |
if selection_mask is None:
|
1045 |
model.selection_mask_ = np.ones(n_features_in, dtype=bool)
|
|
|
1359 |
|
1360 |
return packed_modified_params
|
1361 |
|
1362 |
+
def _validate_and_set_fit_params(
|
1363 |
+
self, X, y, Xresampled, weights, variable_names, X_units, y_units
|
1364 |
+
):
|
1365 |
"""
|
1366 |
Validate the parameters passed to the :term:`fit` method.
|
1367 |
|
|
|
1383 |
for that particular element of y.
|
1384 |
variable_names : list[str] of length n_features
|
1385 |
Names of each variable in the training dataset, `X`.
|
1386 |
+
X_units : list[str] of length n_features
|
1387 |
+
Units of each variable in the training dataset, `X`.
|
1388 |
+
y_units : str | list[str] of length n_out
|
1389 |
+
Units of each variable in the training dataset, `y`.
|
1390 |
|
1391 |
Returns
|
1392 |
-------
|
|
|
1398 |
Validated resampled training data used for denoising.
|
1399 |
variable_names_validated : list[str] of length n_features
|
1400 |
Validated list of variable names for each feature in `X`.
|
1401 |
+
X_units : list[str] of length n_features
|
1402 |
+
Validated units for `X`.
|
1403 |
+
y_units : str | list[str] of length n_out
|
1404 |
+
Validated units for `y`.
|
1405 |
|
1406 |
"""
|
1407 |
if isinstance(X, pd.DataFrame):
|
|
|
1412 |
"Using DataFrame column names instead."
|
1413 |
)
|
1414 |
|
1415 |
+
if (
|
1416 |
+
pd.api.types.is_object_dtype(X.columns)
|
1417 |
+
and X.columns.str.contains(" ").any()
|
1418 |
+
):
|
1419 |
X.columns = X.columns.str.replace(" ", "_")
|
1420 |
warnings.warn(
|
1421 |
"Spaces in DataFrame column names are not supported. "
|
|
|
1438 |
weights = check_array(weights, ensure_2d=False)
|
1439 |
check_consistent_length(weights, y)
|
1440 |
X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
|
1441 |
+
self.feature_names_in_ = _check_feature_names_in(
|
1442 |
+
self, variable_names, generate_names=False
|
1443 |
+
)
|
1444 |
+
|
1445 |
+
if self.feature_names_in_ is None:
|
1446 |
+
self.feature_names_in_ = np.array([f"x{i}" for i in range(X.shape[1])])
|
1447 |
+
self.pretty_feature_names_in_ = np.array(
|
1448 |
+
[f"x{_subscriptify(i)}" for i in range(X.shape[1])]
|
1449 |
+
)
|
1450 |
+
else:
|
1451 |
+
self.pretty_feature_names_in_ = None
|
1452 |
+
|
1453 |
variable_names = self.feature_names_in_
|
1454 |
|
1455 |
# Handle multioutput data
|
|
|
1460 |
else:
|
1461 |
raise NotImplementedError("y shape not supported!")
|
1462 |
|
1463 |
+
self.X_units_ = copy.deepcopy(X_units)
|
1464 |
+
self.y_units_ = copy.deepcopy(y_units)
|
1465 |
+
|
1466 |
+
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
1467 |
|
1468 |
def _pre_transform_training_data(
|
1469 |
+
self, X, y, Xresampled, variable_names, X_units, y_units, random_state
|
1470 |
):
|
1471 |
"""
|
1472 |
Transform the training data before fitting the symbolic regressor.
|
|
|
1486 |
variable_names : list[str]
|
1487 |
Names of each variable in the training dataset, `X`.
|
1488 |
Of length `n_features`.
|
1489 |
+
X_units : list[str]
|
1490 |
+
Units of each variable in the training dataset, `X`.
|
1491 |
+
y_units : str | list[str]
|
1492 |
+
Units of each variable in the training dataset, `y`.
|
1493 |
random_state : int | np.RandomState
|
1494 |
Pass an int for reproducible results across multiple function calls.
|
1495 |
See :term:`Glossary <random_state>`. Default is `None`.
|
|
|
1511 |
variable_names_transformed : list[str] of length n_features
|
1512 |
Names of each variable in the transformed dataset,
|
1513 |
`X_transformed`.
|
1514 |
+
X_units_transformed : list[str] of length n_features
|
1515 |
+
Units of each variable in the transformed dataset.
|
1516 |
+
y_units_transformed : str | list[str] of length n_out
|
1517 |
+
Units of each variable in the transformed dataset.
|
1518 |
"""
|
1519 |
# Feature selection transformation
|
1520 |
if self.select_k_features:
|
|
|
1529 |
# Reduce variable_names to selection
|
1530 |
variable_names = [variable_names[i] for i in self.selection_mask_]
|
1531 |
|
1532 |
+
if X_units is not None:
|
1533 |
+
X_units = [X_units[i] for i in self.selection_mask_]
|
1534 |
+
self.X_units_ = copy.deepcopy(X_units)
|
1535 |
+
|
1536 |
# Re-perform data validation and feature name updating
|
1537 |
X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
|
1538 |
# Update feature names with selected variable names
|
1539 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
1540 |
+
self.pretty_feature_names_in_ = None
|
1541 |
print(f"Using features {self.feature_names_in_}")
|
1542 |
|
1543 |
# Denoising transformation
|
|
|
1557 |
else:
|
1558 |
X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
|
1559 |
|
1560 |
+
return X, y, variable_names, X_units, y_units
|
1561 |
|
1562 |
def _run(self, X, y, mutated_params, weights, seed):
|
1563 |
"""
|
|
|
1710 |
tournament_selection_n=self.tournament_selection_n,
|
1711 |
# These have the same name:
|
1712 |
parsimony=self.parsimony,
|
1713 |
+
dimensional_constraint_penalty=self.dimensional_constraint_penalty,
|
1714 |
alpha=self.alpha,
|
1715 |
maxdepth=maxdepth,
|
1716 |
fast_cycle=self.fast_cycle,
|
|
|
1730 |
fraction_replaced=self.fraction_replaced,
|
1731 |
topn=self.topn,
|
1732 |
verbosity=self.verbosity,
|
1733 |
+
print_precision=self.print_precision,
|
1734 |
optimizer_algorithm=self.optimizer_algorithm,
|
1735 |
optimizer_nrestarts=self.optimizer_nrestarts,
|
1736 |
optimizer_probability=self.optimize_probability,
|
|
|
1782 |
None if parallelism in ["serial", "multithreading"] else int(self.procs)
|
1783 |
)
|
1784 |
|
1785 |
+
y_variable_names = None
|
1786 |
+
if len(y.shape) > 1:
|
1787 |
+
# We set these manually so that they respect Python's 0 indexing
|
1788 |
+
# (by default Julia will use y1, y2...)
|
1789 |
+
y_variable_names = [f"y{_subscriptify(i)}" for i in range(y.shape[1])]
|
1790 |
+
|
1791 |
# Call to Julia backend.
|
1792 |
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl
|
1793 |
self.raw_julia_state_ = SymbolicRegression.equation_search(
|
|
|
1795 |
Main.y,
|
1796 |
weights=Main.weights,
|
1797 |
niterations=int(self.niterations),
|
1798 |
+
variable_names=(
|
1799 |
+
self.pretty_feature_names_in_.tolist()
|
1800 |
+
if hasattr(self, "pretty_feature_names_in_")
|
1801 |
+
and self.pretty_feature_names_in_ is not None
|
1802 |
+
else self.feature_names_in_.tolist()
|
1803 |
+
),
|
1804 |
+
y_variable_names=y_variable_names,
|
1805 |
+
X_units=self.X_units_,
|
1806 |
+
y_units=self.y_units_,
|
1807 |
options=options,
|
1808 |
numprocs=cprocs,
|
1809 |
parallelism=parallelism,
|
|
|
1829 |
Xresampled=None,
|
1830 |
weights=None,
|
1831 |
variable_names=None,
|
1832 |
+
X_units=None,
|
1833 |
+
y_units=None,
|
1834 |
):
|
1835 |
"""
|
1836 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
|
|
1858 |
instead of `variable_names`. Cannot contain spaces or special
|
1859 |
characters. Avoid variable names which are also
|
1860 |
function names in `sympy`, such as "N".
|
1861 |
+
X_units : list[str]
|
1862 |
+
A list of units for each variable in `X`. Each unit should be
|
1863 |
+
a string representing a Julia expression. See DynamicQuantities.jl
|
1864 |
+
https://symbolicml.org/DynamicQuantities.jl/dev/units/ for more
|
1865 |
+
information.
|
1866 |
+
y_units : str | list[str]
|
1867 |
+
Similar to `X_units`, but as a unit for the target variable, `y`.
|
1868 |
+
If `y` is a matrix, a list of units should be passed. If `X_units`
|
1869 |
+
is given but `y_units` is not, then `y_units` will be arbitrary.
|
1870 |
|
1871 |
Returns
|
1872 |
-------
|
|
|
1888 |
self.nout_ = 1
|
1889 |
self.selection_mask_ = None
|
1890 |
self.raw_julia_state_ = None
|
1891 |
+
self.X_units_ = None
|
1892 |
+
self.y_units_ = None
|
1893 |
|
1894 |
random_state = check_random_state(self.random_state) # For np random
|
1895 |
seed = random_state.get_state()[1][0] # For julia random
|
|
|
1898 |
|
1899 |
mutated_params = self._validate_and_set_init_params()
|
1900 |
|
1901 |
+
(
|
1902 |
+
X,
|
1903 |
+
y,
|
1904 |
+
Xresampled,
|
1905 |
+
weights,
|
1906 |
+
variable_names,
|
1907 |
+
X_units,
|
1908 |
+
y_units,
|
1909 |
+
) = self._validate_and_set_fit_params(
|
1910 |
+
X, y, Xresampled, weights, variable_names, X_units, y_units
|
1911 |
)
|
1912 |
|
1913 |
if X.shape[0] > 10000 and not self.batching:
|
|
|
1922 |
)
|
1923 |
|
1924 |
# Pre transformations (feature selection and denoising)
|
1925 |
+
X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
|
1926 |
+
X, y, Xresampled, variable_names, X_units, y_units, random_state
|
1927 |
)
|
1928 |
|
1929 |
# Warn about large feature counts (still warn if feature count is large
|
|
|
1952 |
variable_names,
|
1953 |
weights,
|
1954 |
y,
|
1955 |
+
X_units,
|
1956 |
+
y_units,
|
1957 |
)
|
1958 |
|
1959 |
# Initially, just save model parameters, so that
|
|
|
2192 |
with open(cur_filename, "r") as f:
|
2193 |
buf = f.read()
|
2194 |
buf = _preprocess_julia_floats(buf)
|
2195 |
+
|
2196 |
+
df = self._postprocess_dataframe(pd.read_csv(StringIO(buf)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2197 |
|
2198 |
all_outputs.append(df)
|
2199 |
else:
|
|
|
2203 |
with open(filename, "r") as f:
|
2204 |
buf = f.read()
|
2205 |
buf = _preprocess_julia_floats(buf)
|
2206 |
+
all_outputs = [self._postprocess_dataframe(pd.read_csv(StringIO(buf)))]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2207 |
|
2208 |
except FileNotFoundError:
|
2209 |
raise RuntimeError(
|
|
|
2212 |
)
|
2213 |
return all_outputs
|
2214 |
|
2215 |
+
def _postprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
2216 |
+
df = df.rename(
|
2217 |
+
columns={
|
2218 |
+
"Complexity": "complexity",
|
2219 |
+
"Loss": "loss",
|
2220 |
+
"Equation": "equation",
|
2221 |
+
},
|
2222 |
+
)
|
2223 |
+
# Regexp replace x₁₂₃ to x123 in `equation`:
|
2224 |
+
if (
|
2225 |
+
hasattr(self, "pretty_feature_names_in_")
|
2226 |
+
and self.pretty_feature_names_in_ is not None
|
2227 |
+
):
|
2228 |
+
# df["equation"] = df["equation"].apply(_undo_subscriptify_full)
|
2229 |
+
for pname, name in zip(
|
2230 |
+
self.pretty_feature_names_in_, self.feature_names_in_
|
2231 |
+
):
|
2232 |
+
df["equation"] = df["equation"].apply(
|
2233 |
+
lambda s: re.sub(
|
2234 |
+
r"\b" + f"({pname})" + r"\b",
|
2235 |
+
name,
|
2236 |
+
s,
|
2237 |
+
)
|
2238 |
+
if isinstance(s, str)
|
2239 |
+
else s
|
2240 |
+
)
|
2241 |
+
|
2242 |
+
return df
|
2243 |
+
|
2244 |
def get_hof(self):
|
2245 |
"""Get the equations from a hall of fame file.
|
2246 |
|
|
|
2541 |
s = _apply_regexp_im_sci(s)
|
2542 |
s = _apply_regexp_sci(s)
|
2543 |
return s
|
2544 |
+
|
2545 |
+
|
2546 |
+
def _subscriptify(i: int) -> str:
|
2547 |
+
"""Converts integer to subscript text form.
|
2548 |
+
|
2549 |
+
For example, 123 -> "₁₂₃".
|
2550 |
+
"""
|
2551 |
+
return "".join([chr(0x2080 + int(c)) for c in str(i)])
|
pysr/test/test.py
CHANGED
@@ -19,6 +19,7 @@ from ..sr import (
|
|
19 |
_handle_feature_selection,
|
20 |
_csv_filename_to_pkl_filename,
|
21 |
idx_model_selection,
|
|
|
22 |
)
|
23 |
from ..export_latex import to_latex
|
24 |
|
@@ -711,6 +712,26 @@ class TestMiscellaneous(unittest.TestCase):
|
|
711 |
# If any checks failed don't let the test pass.
|
712 |
self.assertEqual(len(exception_messages), 0)
|
713 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
714 |
|
715 |
TRUE_PREAMBLE = "\n".join(
|
716 |
[
|
@@ -906,6 +927,151 @@ class TestLaTeXTable(unittest.TestCase):
|
|
906 |
self.assertEqual(latex_table_str, true_latex_table_str)
|
907 |
|
908 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
909 |
def runtests():
|
910 |
"""Run all tests in test.py."""
|
911 |
suite = unittest.TestSuite()
|
@@ -916,6 +1082,7 @@ def runtests():
|
|
916 |
TestFeatureSelection,
|
917 |
TestMiscellaneous,
|
918 |
TestLaTeXTable,
|
|
|
919 |
]
|
920 |
for test_case in test_cases:
|
921 |
tests = loader.loadTestsFromTestCase(test_case)
|
|
|
19 |
_handle_feature_selection,
|
20 |
_csv_filename_to_pkl_filename,
|
21 |
idx_model_selection,
|
22 |
+
_check_assertions,
|
23 |
)
|
24 |
from ..export_latex import to_latex
|
25 |
|
|
|
712 |
# If any checks failed don't let the test pass.
|
713 |
self.assertEqual(len(exception_messages), 0)
|
714 |
|
715 |
+
def test_param_groupings(self):
    """Test that param_groupings are complete"""
    param_groupings_file = Path(__file__).parent.parent / "param_groupings.yml"

    # Read the file line by line: lines ending in ":" are skipped, and
    # lines starting with "-" contribute the text after the dash.
    params = []
    with open(param_groupings_file, "r") as f:
        for line in f:
            stripped = line.strip()
            if stripped.endswith(":"):
                continue
            if stripped.startswith("-"):
                params.append(stripped[1:].strip())

    regressor_params = [
        p for p in DEFAULT_PARAMS.keys() if p not in ["self", "kwargs"]
    ]

    # Check the sets are equal:
    self.assertSetEqual(set(params), set(regressor_params))
|
734 |
+
|
735 |
|
736 |
TRUE_PREAMBLE = "\n".join(
|
737 |
[
|
|
|
927 |
self.assertEqual(latex_table_str, true_latex_table_str)
|
928 |
|
929 |
|
930 |
+
class TestDimensionalConstraints(unittest.TestCase):
    """Tests for fitting with `X_units` / `y_units` passed to `fit`."""

    def setUp(self):
        self.default_test_kwargs = dict(
            progress=False,
            model_selection="accuracy",
            niterations=DEFAULT_NITERATIONS * 2,
            populations=DEFAULT_POPULATIONS * 2,
            temp_equation_file=True,
        )
        self.rstate = np.random.RandomState(0)
        self.X = self.rstate.randn(100, 5)

    def test_dimensional_constraints(self):
        y = np.cos(self.X[:, [0, 1]])
        model = PySRRegressor(
            binary_operators=[
                "my_add(x, y) = x + y",
                "my_sub(x, y) = x - y",
                "my_mul(x, y) = x * y",
            ],
            unary_operators=["my_cos(x) = cos(x)"],
            **self.default_test_kwargs,
            early_stop_condition=1e-8,
            select_k_features=3,
            extra_sympy_mappings={
                "my_cos": sympy.cos,
                "my_add": lambda x, y: x + y,
                "my_sub": lambda x, y: x - y,
                "my_mul": lambda x, y: x * y,
            },
        )
        model.fit(self.X, y, X_units=["m", "m", "m", "m", "m"], y_units=["m", "m"])

        # The best expression should have complexity larger than just 2:
        for i in range(2):
            self.assertGreater(model.get_best()[i]["complexity"], 2)
            self.assertLess(model.get_best()[i]["loss"], 1e-6)
            self.assertGreater(
                model.equations_[i].query("complexity <= 2").loss.min(), 1e-6
            )

    def test_unit_checks(self):
        """This just checks the number of units passed"""
        use_custom_variable_names = False
        variable_names = None
        weights = None
        args = (use_custom_variable_names, variable_names, weights)

        # Each case is (X, y, X_units, y_units).
        valid_units = [
            (np.ones((10, 2)), np.ones(10), ["m/s", "s"], "m"),
            (np.ones((10, 1)), np.ones(10), ["m/s"], None),
            (np.ones((10, 1)), np.ones(10), None, "m/s"),
            (np.ones((10, 1)), np.ones(10), None, ["m/s"]),
            (np.ones((10, 1)), np.ones((10, 1)), None, ["m/s"]),
            (np.ones((10, 1)), np.ones((10, 2)), None, ["m/s", ""]),
        ]
        for X, y, X_units, y_units in valid_units:
            _check_assertions(
                X,
                *args,
                y,
                X_units,
                y_units,
            )

        invalid_units = [
            (np.ones((10, 2)), np.ones(10), ["m/s", "s", "s^2"], None),
            (np.ones((10, 2)), np.ones(10), ["m/s", "s", "s^2"], "m"),
            (np.ones((10, 2)), np.ones((10, 2)), ["m/s", "s"], ["m"]),
            (np.ones((10, 1)), np.ones((10, 1)), "m/s", ["m"]),
        ]
        for X, y, X_units, y_units in invalid_units:
            with self.assertRaises(ValueError):
                _check_assertions(
                    X,
                    *args,
                    y,
                    X_units,
                    y_units,
                )

    def test_unit_propagation(self):
        """Check that units are propagated correctly.

        This also tests that variables have the correct names.
        """
        X = np.ones((100, 3))
        y = np.ones((100, 1))
        temp_dir = Path(tempfile.mkdtemp())
        equation_file = str(temp_dir / "equation_file.csv")
        model = PySRRegressor(
            binary_operators=["+", "*"],
            early_stop_condition="(l, c) -> l < 1e-6 && c == 3",
            progress=False,
            model_selection="accuracy",
            niterations=DEFAULT_NITERATIONS * 2,
            populations=DEFAULT_POPULATIONS * 2,
            complexity_of_constants=10,
            weight_mutate_constant=0.0,
            should_optimize_constants=False,
            multithreading=False,
            deterministic=True,
            procs=0,
            random_state=0,
            equation_file=equation_file,
            warm_start=True,
        )
        model.fit(
            X,
            y,
            X_units=["m", "s", "A"],
            y_units=["m*A"],
        )
        best = model.get_best()
        self.assertIn("x0", best["equation"])
        self.assertNotIn("x1", best["equation"])
        self.assertIn("x2", best["equation"])
        self.assertEqual(best["complexity"], 3)
        self.assertEqual(model.equations_.iloc[0].complexity, 1)
        self.assertGreater(model.equations_.iloc[0].loss, 1e-6)

        # With pkl file:
        pkl_file = str(temp_dir / "equation_file.pkl")
        model2 = PySRRegressor.from_file(pkl_file)
        best2 = model2.get_best()
        self.assertIn("x0", best2["equation"])

        # From csv file alone; the pkl file must be deleted first:
        os.remove(pkl_file)
        model3 = PySRRegressor.from_file(
            equation_file, binary_operators=["+", "*"], n_features_in=X.shape[1]
        )
        best3 = model3.get_best()
        self.assertIn("x0", best3["equation"])

        # Try warm start, but with no units provided (should
        # be a different dataset, and thus different result).
        # The stopping condition has to be updated *before* fitting,
        # otherwise it has no effect on the run being checked.
        model.early_stop_condition = "(l, c) -> l < 1e-6 && c == 1"
        model.fit(X, y)
        self.assertEqual(model.equations_.iloc[0].complexity, 1)
        self.assertLess(model.equations_.iloc[0].loss, 1e-6)
|
1070 |
+
|
1071 |
+
|
1072 |
+
# TODO: Determine desired behavior if second .fit() call does not have units
|
1073 |
+
|
1074 |
+
|
1075 |
def runtests():
|
1076 |
"""Run all tests in test.py."""
|
1077 |
suite = unittest.TestSuite()
|
|
|
1082 |
TestFeatureSelection,
|
1083 |
TestMiscellaneous,
|
1084 |
TestLaTeXTable,
|
1085 |
+
TestDimensionalConstraints,
|
1086 |
]
|
1087 |
for test_case in test_cases:
|
1088 |
tests = loader.loadTestsFromTestCase(test_case)
|
pysr/version.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1 |
-
__version__ = "0.
|
2 |
-
__symbolic_regression_jl_version__ = "0.
|
|
|
1 |
+
__version__ = "0.15.0"
|
2 |
+
__symbolic_regression_jl_version__ = "0.21.3"
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
sympy
|
2 |
-
pandas
|
3 |
numpy
|
4 |
scikit_learn>=1.0.0
|
5 |
julia>=0.6.0
|
|
|
1 |
sympy
|
2 |
+
pandas>=0.21.0
|
3 |
numpy
|
4 |
scikit_learn>=1.0.0
|
5 |
julia>=0.6.0
|