Spaces:
Running
Running
MilesCranmer
committed on
Merge branch 'master' into gui
Browse files- .github/workflows/CI.yml +17 -23
- .github/workflows/CI_Windows.yml +8 -14
- .github/workflows/CI_docker.yml +6 -12
- .github/workflows/CI_docker_large_nightly.yml +2 -2
- .github/workflows/CI_large_nightly.yml +3 -3
- .github/workflows/CI_mac.yml +7 -13
- .github/workflows/docker_deploy.yml +6 -6
- .github/workflows/update_backend.yml +0 -1
- .gitignore +2 -0
- .pre-commit-config.yaml +3 -3
- Dockerfile +2 -2
- README.md +1 -1
- benchmarks/hyperparamopt.py +1 -0
- benchmarks/print_best_model.py +1 -0
- docs/examples.md +3 -1
- docs/generate_papers.py +1 -0
- environment.yml +1 -2
- examples/pysr_demo.ipynb +1 -1
- pyproject.toml +16 -2
- pysr/denoising.py +18 -4
- pysr/deprecated.py +1 -0
- pysr/export_jax.py +4 -1
- pysr/export_latex.py +13 -0
- pysr/export_numpy.py +11 -2
- pysr/export_sympy.py +15 -6
- pysr/export_torch.py +9 -6
- pysr/feature_selection.py +20 -3
- pysr/julia_helpers.py +17 -5
- pysr/julia_import.py +9 -21
- pysr/juliapkg.json +1 -1
- pysr/param_groupings.yml +1 -0
- pysr/sklearn_monkeypatch.py +1 -2
- pysr/sr.py +381 -193
- pysr/test/__main__.py +1 -0
- pysr/test/params.py +1 -1
- pysr/test/test.py +220 -106
- pysr/test/test_jax.py +41 -10
- pysr/test/test_startup.py +3 -6
- pysr/test/test_torch.py +36 -2
- pysr/utils.py +22 -2
- requirements.txt +2 -3
.github/workflows/CI.yml
CHANGED
@@ -5,20 +5,14 @@ on:
|
|
5 |
branches:
|
6 |
- '**'
|
7 |
paths:
|
8 |
-
- '
|
9 |
-
- 'pysr/**'
|
10 |
-
- '.github/workflows/CI.yml'
|
11 |
-
- 'setup.py'
|
12 |
tags:
|
13 |
- 'v*.*.*'
|
14 |
pull_request:
|
15 |
branches:
|
16 |
-
- '
|
17 |
paths:
|
18 |
-
- '
|
19 |
-
- 'pysr/**'
|
20 |
-
- '.github/workflows/CI.yml'
|
21 |
-
- 'setup.py'
|
22 |
|
23 |
jobs:
|
24 |
test:
|
@@ -32,12 +26,12 @@ jobs:
|
|
32 |
strategy:
|
33 |
matrix:
|
34 |
julia-version: ['1']
|
35 |
-
python-version: ['3.
|
36 |
os: [ubuntu-latest]
|
37 |
test-id: [main]
|
38 |
include:
|
39 |
- julia-version: '1.6'
|
40 |
-
python-version: '3.
|
41 |
os: ubuntu-latest
|
42 |
test-id: include
|
43 |
- julia-version: '1'
|
@@ -48,11 +42,11 @@ jobs:
|
|
48 |
steps:
|
49 |
- uses: actions/checkout@v4
|
50 |
- name: "Set up Julia"
|
51 |
-
uses: julia-actions/setup-julia@
|
52 |
with:
|
53 |
version: ${{ matrix.julia-version }}
|
54 |
- name: "Cache Julia"
|
55 |
-
uses: julia-actions/cache@
|
56 |
with:
|
57 |
cache-name: ${{ matrix.os }}-test-${{ matrix.julia-version }}-${{ matrix.python-version }}
|
58 |
cache-packages: false
|
@@ -90,7 +84,7 @@ jobs:
|
|
90 |
- name: "Coveralls"
|
91 |
env:
|
92 |
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
93 |
-
COVERALLS_FLAG_NAME: test-${{ matrix.julia-version }}-${{ matrix.python-version }}
|
94 |
COVERALLS_PARALLEL: true
|
95 |
run: coveralls --service=github
|
96 |
|
@@ -99,11 +93,11 @@ jobs:
|
|
99 |
strategy:
|
100 |
matrix:
|
101 |
os: ['ubuntu-latest']
|
102 |
-
python-version: ['3.
|
103 |
julia-version: ['1']
|
104 |
include:
|
105 |
- os: ubuntu-latest
|
106 |
-
python-version: '3.
|
107 |
julia-version: '1.6'
|
108 |
steps:
|
109 |
- uses: actions/checkout@v4
|
@@ -122,7 +116,7 @@ jobs:
|
|
122 |
shell: bash -l {0}
|
123 |
strategy:
|
124 |
matrix:
|
125 |
-
python-version: ['3.
|
126 |
os: ['ubuntu-latest']
|
127 |
|
128 |
steps:
|
@@ -144,7 +138,7 @@ jobs:
|
|
144 |
activate-environment: pysr-test
|
145 |
environment-file: environment.yml
|
146 |
- name: "Cache Julia"
|
147 |
-
uses: julia-actions/cache@
|
148 |
with:
|
149 |
cache-name: ${{ matrix.os }}-conda-${{ matrix.python-version }}
|
150 |
cache-packages: false
|
@@ -181,8 +175,8 @@ jobs:
|
|
181 |
strategy:
|
182 |
matrix:
|
183 |
python-version:
|
184 |
-
- '3.
|
185 |
-
- '3.
|
186 |
os: ['ubuntu-latest']
|
187 |
|
188 |
steps:
|
@@ -199,10 +193,10 @@ jobs:
|
|
199 |
pip install mypy
|
200 |
- name: "Install additional dependencies"
|
201 |
run: python -m pip install jax jaxlib torch
|
202 |
-
if: ${{ matrix.python-version != '3.
|
203 |
- name: "Run mypy"
|
204 |
run: python -m mypy --install-types --non-interactive pysr
|
205 |
-
if: ${{ matrix.python-version != '3.
|
206 |
- name: "Run compatible mypy"
|
207 |
run: python -m mypy --ignore-missing-imports pysr
|
208 |
-
if: ${{ matrix.python-version == '3.
|
|
|
5 |
branches:
|
6 |
- '**'
|
7 |
paths:
|
8 |
+
- '**'
|
|
|
|
|
|
|
9 |
tags:
|
10 |
- 'v*.*.*'
|
11 |
pull_request:
|
12 |
branches:
|
13 |
+
- 'master'
|
14 |
paths:
|
15 |
+
- '**'
|
|
|
|
|
|
|
16 |
|
17 |
jobs:
|
18 |
test:
|
|
|
26 |
strategy:
|
27 |
matrix:
|
28 |
julia-version: ['1']
|
29 |
+
python-version: ['3.12']
|
30 |
os: [ubuntu-latest]
|
31 |
test-id: [main]
|
32 |
include:
|
33 |
- julia-version: '1.6'
|
34 |
+
python-version: '3.8'
|
35 |
os: ubuntu-latest
|
36 |
test-id: include
|
37 |
- julia-version: '1'
|
|
|
42 |
steps:
|
43 |
- uses: actions/checkout@v4
|
44 |
- name: "Set up Julia"
|
45 |
+
uses: julia-actions/setup-julia@v2
|
46 |
with:
|
47 |
version: ${{ matrix.julia-version }}
|
48 |
- name: "Cache Julia"
|
49 |
+
uses: julia-actions/cache@v2
|
50 |
with:
|
51 |
cache-name: ${{ matrix.os }}-test-${{ matrix.julia-version }}-${{ matrix.python-version }}
|
52 |
cache-packages: false
|
|
|
84 |
- name: "Coveralls"
|
85 |
env:
|
86 |
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
87 |
+
COVERALLS_FLAG_NAME: test-${{ matrix.julia-version }}-${{ matrix.python-version }}-${{ matrix.test-id }}
|
88 |
COVERALLS_PARALLEL: true
|
89 |
run: coveralls --service=github
|
90 |
|
|
|
93 |
strategy:
|
94 |
matrix:
|
95 |
os: ['ubuntu-latest']
|
96 |
+
python-version: ['3.12']
|
97 |
julia-version: ['1']
|
98 |
include:
|
99 |
- os: ubuntu-latest
|
100 |
+
python-version: '3.8'
|
101 |
julia-version: '1.6'
|
102 |
steps:
|
103 |
- uses: actions/checkout@v4
|
|
|
116 |
shell: bash -l {0}
|
117 |
strategy:
|
118 |
matrix:
|
119 |
+
python-version: ['3.12']
|
120 |
os: ['ubuntu-latest']
|
121 |
|
122 |
steps:
|
|
|
138 |
activate-environment: pysr-test
|
139 |
environment-file: environment.yml
|
140 |
- name: "Cache Julia"
|
141 |
+
uses: julia-actions/cache@v2
|
142 |
with:
|
143 |
cache-name: ${{ matrix.os }}-conda-${{ matrix.python-version }}
|
144 |
cache-packages: false
|
|
|
175 |
strategy:
|
176 |
matrix:
|
177 |
python-version:
|
178 |
+
- '3.12'
|
179 |
+
- '3.8'
|
180 |
os: ['ubuntu-latest']
|
181 |
|
182 |
steps:
|
|
|
193 |
pip install mypy
|
194 |
- name: "Install additional dependencies"
|
195 |
run: python -m pip install jax jaxlib torch
|
196 |
+
if: ${{ matrix.python-version != '3.8' }}
|
197 |
- name: "Run mypy"
|
198 |
run: python -m mypy --install-types --non-interactive pysr
|
199 |
+
if: ${{ matrix.python-version != '3.8' }}
|
200 |
- name: "Run compatible mypy"
|
201 |
run: python -m mypy --ignore-missing-imports pysr
|
202 |
+
if: ${{ matrix.python-version == '3.8' }}
|
.github/workflows/CI_Windows.yml
CHANGED
@@ -3,22 +3,16 @@ name: Windows
|
|
3 |
on:
|
4 |
push:
|
5 |
branches:
|
6 |
-
- '
|
7 |
paths:
|
8 |
-
- '
|
9 |
-
- 'pysr/**'
|
10 |
-
- '.github/workflows/CI_Windows.yml'
|
11 |
-
- 'setup.py'
|
12 |
tags:
|
13 |
- 'v*.*.*'
|
14 |
pull_request:
|
15 |
branches:
|
16 |
-
- '
|
17 |
paths:
|
18 |
-
- '
|
19 |
-
- 'pysr/**'
|
20 |
-
- '.github/workflows/CI_Windows.yml'
|
21 |
-
- 'setup.py'
|
22 |
|
23 |
jobs:
|
24 |
test:
|
@@ -30,17 +24,17 @@ jobs:
|
|
30 |
strategy:
|
31 |
matrix:
|
32 |
julia-version: ['1']
|
33 |
-
python-version: ['3.
|
34 |
os: [windows-latest]
|
35 |
|
36 |
steps:
|
37 |
- uses: actions/checkout@v4
|
38 |
- name: "Set up Julia"
|
39 |
-
uses: julia-actions/setup-julia@
|
40 |
with:
|
41 |
version: ${{ matrix.julia-version }}
|
42 |
- name: "Cache Julia"
|
43 |
-
uses: julia-actions/cache@
|
44 |
with:
|
45 |
cache-name: ${{ matrix.os }}-test-${{ matrix.julia-version }}-${{ matrix.python-version }}
|
46 |
cache-packages: false
|
@@ -52,7 +46,7 @@ jobs:
|
|
52 |
- name: "Install PySR"
|
53 |
run: |
|
54 |
python -m pip install --upgrade pip
|
55 |
-
pip install pytest nbval
|
56 |
pip install .
|
57 |
python -c 'import pysr'
|
58 |
- name: "Run tests"
|
|
|
3 |
on:
|
4 |
push:
|
5 |
branches:
|
6 |
+
- 'master'
|
7 |
paths:
|
8 |
+
- '**'
|
|
|
|
|
|
|
9 |
tags:
|
10 |
- 'v*.*.*'
|
11 |
pull_request:
|
12 |
branches:
|
13 |
+
- 'master'
|
14 |
paths:
|
15 |
+
- '**'
|
|
|
|
|
|
|
16 |
|
17 |
jobs:
|
18 |
test:
|
|
|
24 |
strategy:
|
25 |
matrix:
|
26 |
julia-version: ['1']
|
27 |
+
python-version: ['3.12']
|
28 |
os: [windows-latest]
|
29 |
|
30 |
steps:
|
31 |
- uses: actions/checkout@v4
|
32 |
- name: "Set up Julia"
|
33 |
+
uses: julia-actions/setup-julia@v2
|
34 |
with:
|
35 |
version: ${{ matrix.julia-version }}
|
36 |
- name: "Cache Julia"
|
37 |
+
uses: julia-actions/cache@v2
|
38 |
with:
|
39 |
cache-name: ${{ matrix.os }}-test-${{ matrix.julia-version }}-${{ matrix.python-version }}
|
40 |
cache-packages: false
|
|
|
46 |
- name: "Install PySR"
|
47 |
run: |
|
48 |
python -m pip install --upgrade pip
|
49 |
+
pip install pytest nbval "numpy<2.0.0"
|
50 |
pip install .
|
51 |
python -c 'import pysr'
|
52 |
- name: "Run tests"
|
.github/workflows/CI_docker.yml
CHANGED
@@ -3,22 +3,16 @@ name: Docker
|
|
3 |
on:
|
4 |
push:
|
5 |
branches:
|
6 |
-
- '
|
7 |
paths:
|
8 |
-
- '
|
9 |
-
|
10 |
-
- '
|
11 |
-
- 'setup.py'
|
12 |
-
- 'Dockerfile'
|
13 |
pull_request:
|
14 |
branches:
|
15 |
-
- '
|
16 |
paths:
|
17 |
-
- '
|
18 |
-
- 'pysr/**'
|
19 |
-
- '.github/workflows/CI_docker.yml'
|
20 |
-
- 'setup.py'
|
21 |
-
- 'Dockerfile'
|
22 |
|
23 |
jobs:
|
24 |
test:
|
|
|
3 |
on:
|
4 |
push:
|
5 |
branches:
|
6 |
+
- 'master'
|
7 |
paths:
|
8 |
+
- '**'
|
9 |
+
tags:
|
10 |
+
- 'v*.*.*'
|
|
|
|
|
11 |
pull_request:
|
12 |
branches:
|
13 |
+
- 'master'
|
14 |
paths:
|
15 |
+
- '**'
|
|
|
|
|
|
|
|
|
16 |
|
17 |
jobs:
|
18 |
test:
|
.github/workflows/CI_docker_large_nightly.yml
CHANGED
@@ -19,7 +19,7 @@ jobs:
|
|
19 |
fail-fast: false
|
20 |
matrix:
|
21 |
julia-version: ['1.6', '1']
|
22 |
-
python-version: ['3.
|
23 |
os: [ubuntu-latest]
|
24 |
arch: ['linux/amd64', 'linux/arm64']
|
25 |
|
@@ -27,7 +27,7 @@ jobs:
|
|
27 |
steps:
|
28 |
- uses: actions/checkout@v4
|
29 |
- name: Set up QEMU
|
30 |
-
uses: docker/setup-qemu-action@
|
31 |
with:
|
32 |
platforms: all
|
33 |
- name: Build docker
|
|
|
19 |
fail-fast: false
|
20 |
matrix:
|
21 |
julia-version: ['1.6', '1']
|
22 |
+
python-version: ['3.8', '3.12']
|
23 |
os: [ubuntu-latest]
|
24 |
arch: ['linux/amd64', 'linux/arm64']
|
25 |
|
|
|
27 |
steps:
|
28 |
- uses: actions/checkout@v4
|
29 |
- name: Set up QEMU
|
30 |
+
uses: docker/setup-qemu-action@v3
|
31 |
with:
|
32 |
platforms: all
|
33 |
- name: Build docker
|
.github/workflows/CI_large_nightly.yml
CHANGED
@@ -23,14 +23,14 @@ jobs:
|
|
23 |
strategy:
|
24 |
fail-fast: false
|
25 |
matrix:
|
26 |
-
julia-version: ['1.6', '1.8', '1.
|
27 |
-
python-version: ['3.
|
28 |
os: [ubuntu-latest, macos-latest, windows-latest]
|
29 |
|
30 |
steps:
|
31 |
- uses: actions/checkout@v4
|
32 |
- name: "Set up Julia"
|
33 |
-
uses: julia-actions/setup-julia@
|
34 |
with:
|
35 |
version: ${{ matrix.julia-version }}
|
36 |
- name: "Set up Python"
|
|
|
23 |
strategy:
|
24 |
fail-fast: false
|
25 |
matrix:
|
26 |
+
julia-version: ['1.6', '1.8', '1.10']
|
27 |
+
python-version: ['3.8', '3.10', '3.12']
|
28 |
os: [ubuntu-latest, macos-latest, windows-latest]
|
29 |
|
30 |
steps:
|
31 |
- uses: actions/checkout@v4
|
32 |
- name: "Set up Julia"
|
33 |
+
uses: julia-actions/setup-julia@v2
|
34 |
with:
|
35 |
version: ${{ matrix.julia-version }}
|
36 |
- name: "Set up Python"
|
.github/workflows/CI_mac.yml
CHANGED
@@ -3,22 +3,16 @@ name: macOS
|
|
3 |
on:
|
4 |
push:
|
5 |
branches:
|
6 |
-
- '
|
7 |
paths:
|
8 |
-
- '
|
9 |
-
- 'pysr/**'
|
10 |
-
- '.github/workflows/CI_mac.yml'
|
11 |
-
- 'setup.py'
|
12 |
tags:
|
13 |
- 'v*.*.*'
|
14 |
pull_request:
|
15 |
branches:
|
16 |
-
- '
|
17 |
paths:
|
18 |
-
- '
|
19 |
-
- 'pysr/**'
|
20 |
-
- '.github/workflows/CI_mac.yml'
|
21 |
-
- 'setup.py'
|
22 |
|
23 |
jobs:
|
24 |
test:
|
@@ -30,17 +24,17 @@ jobs:
|
|
30 |
strategy:
|
31 |
matrix:
|
32 |
julia-version: ['1']
|
33 |
-
python-version: ['3.
|
34 |
os: [macos-latest]
|
35 |
|
36 |
steps:
|
37 |
- uses: actions/checkout@v4
|
38 |
- name: "Set up Julia"
|
39 |
-
uses: julia-actions/setup-julia@
|
40 |
with:
|
41 |
version: ${{ matrix.julia-version }}
|
42 |
- name: "Cache Julia"
|
43 |
-
uses: julia-actions/cache@
|
44 |
with:
|
45 |
cache-name: ${{ matrix.os }}-test-${{ matrix.julia-version }}-${{ matrix.python-version }}
|
46 |
cache-packages: false
|
|
|
3 |
on:
|
4 |
push:
|
5 |
branches:
|
6 |
+
- 'master'
|
7 |
paths:
|
8 |
+
- '**'
|
|
|
|
|
|
|
9 |
tags:
|
10 |
- 'v*.*.*'
|
11 |
pull_request:
|
12 |
branches:
|
13 |
+
- 'master'
|
14 |
paths:
|
15 |
+
- '**'
|
|
|
|
|
|
|
16 |
|
17 |
jobs:
|
18 |
test:
|
|
|
24 |
strategy:
|
25 |
matrix:
|
26 |
julia-version: ['1']
|
27 |
+
python-version: ['3.12']
|
28 |
os: [macos-latest]
|
29 |
|
30 |
steps:
|
31 |
- uses: actions/checkout@v4
|
32 |
- name: "Set up Julia"
|
33 |
+
uses: julia-actions/setup-julia@v2
|
34 |
with:
|
35 |
version: ${{ matrix.julia-version }}
|
36 |
- name: "Cache Julia"
|
37 |
+
uses: julia-actions/cache@v2
|
38 |
with:
|
39 |
cache-name: ${{ matrix.os }}-test-${{ matrix.julia-version }}-${{ matrix.python-version }}
|
40 |
cache-packages: false
|
.github/workflows/docker_deploy.yml
CHANGED
@@ -18,19 +18,19 @@ jobs:
|
|
18 |
matrix:
|
19 |
os: [ubuntu-latest]
|
20 |
arch: [linux/amd64]
|
21 |
-
python-version: [3.
|
22 |
-
julia-version: [1.
|
23 |
steps:
|
24 |
- name: Checkout
|
25 |
uses: actions/checkout@v4
|
26 |
- name: Login to Docker Hub
|
27 |
-
uses: docker/login-action@
|
28 |
if: github.event_name != 'pull_request'
|
29 |
with:
|
30 |
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
31 |
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
32 |
- name: Login to GitHub registry
|
33 |
-
uses: docker/login-action@
|
34 |
if: github.event_name != 'pull_request'
|
35 |
with:
|
36 |
registry: ghcr.io
|
@@ -55,11 +55,11 @@ jobs:
|
|
55 |
type=sha
|
56 |
type=raw,value=latest,enable={{is_default_branch}}
|
57 |
- name: Set up QEMU
|
58 |
-
uses: docker/setup-qemu-action@
|
59 |
- name: Set up Docker Buildx
|
60 |
uses: docker/setup-buildx-action@v3
|
61 |
- name: Build and push
|
62 |
-
uses: docker/build-push-action@
|
63 |
with:
|
64 |
context: .
|
65 |
platforms: ${{ matrix.arch }}
|
|
|
18 |
matrix:
|
19 |
os: [ubuntu-latest]
|
20 |
arch: [linux/amd64]
|
21 |
+
python-version: [3.12.3]
|
22 |
+
julia-version: [1.10.3]
|
23 |
steps:
|
24 |
- name: Checkout
|
25 |
uses: actions/checkout@v4
|
26 |
- name: Login to Docker Hub
|
27 |
+
uses: docker/login-action@v3
|
28 |
if: github.event_name != 'pull_request'
|
29 |
with:
|
30 |
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
31 |
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
32 |
- name: Login to GitHub registry
|
33 |
+
uses: docker/login-action@v3
|
34 |
if: github.event_name != 'pull_request'
|
35 |
with:
|
36 |
registry: ghcr.io
|
|
|
55 |
type=sha
|
56 |
type=raw,value=latest,enable={{is_default_branch}}
|
57 |
- name: Set up QEMU
|
58 |
+
uses: docker/setup-qemu-action@v3
|
59 |
- name: Set up Docker Buildx
|
60 |
uses: docker/setup-buildx-action@v3
|
61 |
- name: Build and push
|
62 |
+
uses: docker/build-push-action@v6
|
63 |
with:
|
64 |
context: .
|
65 |
platforms: ${{ matrix.arch }}
|
.github/workflows/update_backend.yml
CHANGED
@@ -40,7 +40,6 @@ jobs:
|
|
40 |
- name: "Create PR if necessary"
|
41 |
uses: peter-evans/create-pull-request@v6
|
42 |
with:
|
43 |
-
token: ${{ secrets.REPO_SCOPED_TOKEN }}
|
44 |
title: "Automated update to backend: v${{ steps.get-latest.outputs.version }}"
|
45 |
body: |
|
46 |
This PR was automatically generated by the GitHub Action `.github/workflows/update-backend.yml`
|
|
|
40 |
- name: "Create PR if necessary"
|
41 |
uses: peter-evans/create-pull-request@v6
|
42 |
with:
|
|
|
43 |
title: "Automated update to backend: v${{ steps.get-latest.outputs.version }}"
|
44 |
body: |
|
45 |
This PR was automatically generated by the GitHub Action `.github/workflows/update-backend.yml`
|
.gitignore
CHANGED
@@ -23,3 +23,5 @@ site
|
|
23 |
**/*.code-workspace
|
24 |
**/*.tar.gz
|
25 |
venv
|
|
|
|
|
|
23 |
**/*.code-workspace
|
24 |
**/*.tar.gz
|
25 |
venv
|
26 |
+
requirements-dev.lock
|
27 |
+
requirements.lock
|
.pre-commit-config.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
repos:
|
2 |
# General linting
|
3 |
- repo: https://github.com/pre-commit/pre-commit-hooks
|
4 |
-
rev: v4.
|
5 |
hooks:
|
6 |
- id: trailing-whitespace
|
7 |
- id: end-of-file-fixer
|
@@ -9,14 +9,14 @@ repos:
|
|
9 |
- id: check-added-large-files
|
10 |
# General formatting
|
11 |
- repo: https://github.com/psf/black
|
12 |
-
rev:
|
13 |
hooks:
|
14 |
- id: black
|
15 |
- id: black-jupyter
|
16 |
exclude: pysr/test/test_nb.ipynb
|
17 |
# Stripping notebooks
|
18 |
- repo: https://github.com/kynan/nbstripout
|
19 |
-
rev: 0.
|
20 |
hooks:
|
21 |
- id: nbstripout
|
22 |
exclude: pysr/test/test_nb.ipynb
|
|
|
1 |
repos:
|
2 |
# General linting
|
3 |
- repo: https://github.com/pre-commit/pre-commit-hooks
|
4 |
+
rev: v4.6.0
|
5 |
hooks:
|
6 |
- id: trailing-whitespace
|
7 |
- id: end-of-file-fixer
|
|
|
9 |
- id: check-added-large-files
|
10 |
# General formatting
|
11 |
- repo: https://github.com/psf/black
|
12 |
+
rev: 24.4.2
|
13 |
hooks:
|
14 |
- id: black
|
15 |
- id: black-jupyter
|
16 |
exclude: pysr/test/test_nb.ipynb
|
17 |
# Stripping notebooks
|
18 |
- repo: https://github.com/kynan/nbstripout
|
19 |
+
rev: 0.7.1
|
20 |
hooks:
|
21 |
- id: nbstripout
|
22 |
exclude: pysr/test/test_nb.ipynb
|
Dockerfile
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
# This builds a dockerfile containing a working copy of PySR
|
2 |
# with all pre-requisites installed.
|
3 |
|
4 |
-
ARG JLVERSION=1.10.
|
5 |
-
ARG PYVERSION=3.
|
6 |
ARG BASE_IMAGE=bullseye
|
7 |
|
8 |
FROM julia:${JLVERSION}-${BASE_IMAGE} AS jl
|
|
|
1 |
# This builds a dockerfile containing a working copy of PySR
|
2 |
# with all pre-requisites installed.
|
3 |
|
4 |
+
ARG JLVERSION=1.10.4
|
5 |
+
ARG PYVERSION=3.12.2
|
6 |
ARG BASE_IMAGE=bullseye
|
7 |
|
8 |
FROM julia:${JLVERSION}-${BASE_IMAGE} AS jl
|
README.md
CHANGED
@@ -297,7 +297,7 @@ model = PySRRegressor(
|
|
297 |
# ^ Higher precision calculations.
|
298 |
warm_start=True,
|
299 |
# ^ Start from where left off.
|
300 |
-
|
301 |
# ^ Faster evaluation (experimental)
|
302 |
extra_sympy_mappings={"cos2": lambda x: sympy.cos(x)**2},
|
303 |
# extra_torch_mappings={sympy.cos: torch.cos},
|
|
|
297 |
# ^ Higher precision calculations.
|
298 |
warm_start=True,
|
299 |
# ^ Start from where left off.
|
300 |
+
turbo=True,
|
301 |
# ^ Faster evaluation (experimental)
|
302 |
extra_sympy_mappings={"cos2": lambda x: sympy.cos(x)**2},
|
303 |
# extra_torch_mappings={sympy.cos: torch.cos},
|
benchmarks/hyperparamopt.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
"""Start a hyperoptimization from a single node"""
|
|
|
2 |
import pickle as pkl
|
3 |
import sys
|
4 |
|
|
|
1 |
"""Start a hyperoptimization from a single node"""
|
2 |
+
|
3 |
import pickle as pkl
|
4 |
import sys
|
5 |
|
benchmarks/print_best_model.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
"""Print the best model parameters and loss"""
|
|
|
2 |
import pickle as pkl
|
3 |
from pprint import PrettyPrinter
|
4 |
|
|
|
1 |
"""Print the best model parameters and loss"""
|
2 |
+
|
3 |
import pickle as pkl
|
4 |
from pprint import PrettyPrinter
|
5 |
|
docs/examples.md
CHANGED
@@ -428,7 +428,7 @@ the evaluation, as we simply evaluated each argument and divided the result) int
|
|
428 |
`((2.3554819 + -0.3554746) - (x1 * (x0 * x0)))` and
|
429 |
`(-1.0000019 - (x2 * x2))`, meaning that our discovered equation is
|
430 |
equal to:
|
431 |
-
$\frac{x_0^2 x_1 - 2.0000073}{x_2^2
|
432 |
is nearly the same as the true equation!
|
433 |
|
434 |
## 10. Dimensional constraints
|
@@ -520,6 +520,8 @@ a constant `"2.6353e-22[m s⁻²]"`.
|
|
520 |
|
521 |
Note that this expression has a large dynamic range so may be difficult to find. Consider searching with a larger `niterations` if needed.
|
522 |
|
|
|
|
|
523 |
|
524 |
## 11. Additional features
|
525 |
|
|
|
428 |
`((2.3554819 + -0.3554746) - (x1 * (x0 * x0)))` and
|
429 |
`(-1.0000019 - (x2 * x2))`, meaning that our discovered equation is
|
430 |
equal to:
|
431 |
+
$\frac{x_0^2 x_1 - 2.0000073}{x_2^2 + 1.0000019}$, which
|
432 |
is nearly the same as the true equation!
|
433 |
|
434 |
## 10. Dimensional constraints
|
|
|
520 |
|
521 |
Note that this expression has a large dynamic range so may be difficult to find. Consider searching with a larger `niterations` if needed.
|
522 |
|
523 |
+
Note that you can also search for exclusively dimensionless constants by setting
|
524 |
+
`dimensionless_constants_only` to `true`.
|
525 |
|
526 |
## 11. Additional features
|
527 |
|
docs/generate_papers.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
"""This script generates the papers.md file from the papers.yml file."""
|
|
|
2 |
from pathlib import Path
|
3 |
|
4 |
import yaml
|
|
|
1 |
"""This script generates the papers.md file from the papers.yml file."""
|
2 |
+
|
3 |
from pathlib import Path
|
4 |
|
5 |
import yaml
|
environment.yml
CHANGED
@@ -2,11 +2,10 @@ name: test
|
|
2 |
channels:
|
3 |
- conda-forge
|
4 |
dependencies:
|
5 |
-
- python>=3.
|
6 |
- sympy>=1.0.0,<2.0.0
|
7 |
- pandas>=0.21.0,<3.0.0
|
8 |
- numpy>=1.13.0,<2.0.0
|
9 |
- scikit-learn>=1.0.0,<2.0.0
|
10 |
- pyjuliacall>=0.9.15,<0.10.0
|
11 |
- click>=7.0.0,<9.0.0
|
12 |
-
- typing_extensions>=4.0.0,<5.0.0
|
|
|
2 |
channels:
|
3 |
- conda-forge
|
4 |
dependencies:
|
5 |
+
- python>=3.8
|
6 |
- sympy>=1.0.0,<2.0.0
|
7 |
- pandas>=0.21.0,<3.0.0
|
8 |
- numpy>=1.13.0,<2.0.0
|
9 |
- scikit-learn>=1.0.0,<2.0.0
|
10 |
- pyjuliacall>=0.9.15,<0.10.0
|
11 |
- click>=7.0.0,<9.0.0
|
|
examples/pysr_demo.ipynb
CHANGED
@@ -396,7 +396,7 @@
|
|
396 |
"id": "wbWHyOjl2_kX"
|
397 |
},
|
398 |
"source": [
|
399 |
-
"Since `quart` is arguably more complex than the other operators, you can also give it a different complexity, using, e.g., `complexity_of_operators={\"quart\": 2}` to give it a complexity of 2 (instead of the default
|
400 |
"\n",
|
401 |
"\n",
|
402 |
"One can also add a binary operator, with, e.g., `\"myoperator(x, y) = x^2 * y\"`. All Julia operators that work on scalar 32-bit floating point values are available.\n",
|
|
|
396 |
"id": "wbWHyOjl2_kX"
|
397 |
},
|
398 |
"source": [
|
399 |
+
"Since `quart` is arguably more complex than the other operators, you can also give it a different complexity, using, e.g., `complexity_of_operators={\"quart\": 2}` to give it a complexity of 2 (instead of the default 1). You can also define custom complexities for variables and constants (`complexity_of_variables` and `complexity_of_constants`, respectively - both take a single number).\n",
|
400 |
"\n",
|
401 |
"\n",
|
402 |
"One can also add a binary operator, with, e.g., `\"myoperator(x, y) = x^2 * y\"`. All Julia operators that work on scalar 32-bit floating point values are available.\n",
|
pyproject.toml
CHANGED
@@ -4,14 +4,14 @@ build-backend = "setuptools.build_meta"
|
|
4 |
|
5 |
[project]
|
6 |
name = "pysr"
|
7 |
-
version = "0.
|
8 |
authors = [
|
9 |
{name = "Miles Cranmer", email = "miles.cranmer@gmail.com"},
|
10 |
]
|
11 |
description = "Simple and efficient symbolic regression"
|
12 |
readme = {file = "README.md", content-type = "text/markdown"}
|
13 |
license = {file = "LICENSE"}
|
14 |
-
requires-python = ">=3.
|
15 |
classifiers = [
|
16 |
"Programming Language :: Python :: 3",
|
17 |
"Operating System :: OS Independent",
|
@@ -29,3 +29,17 @@ dependencies = {file = "requirements.txt"}
|
|
29 |
|
30 |
[tool.isort]
|
31 |
profile = "black"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
[project]
|
6 |
name = "pysr"
|
7 |
+
version = "0.19.0"
|
8 |
authors = [
|
9 |
{name = "Miles Cranmer", email = "miles.cranmer@gmail.com"},
|
10 |
]
|
11 |
description = "Simple and efficient symbolic regression"
|
12 |
readme = {file = "README.md", content-type = "text/markdown"}
|
13 |
license = {file = "LICENSE"}
|
14 |
+
requires-python = ">=3.8"
|
15 |
classifiers = [
|
16 |
"Programming Language :: Python :: 3",
|
17 |
"Operating System :: OS Independent",
|
|
|
29 |
|
30 |
[tool.isort]
|
31 |
profile = "black"
|
32 |
+
|
33 |
+
[tool.rye]
|
34 |
+
dev-dependencies = [
|
35 |
+
"pre-commit>=3.7.0",
|
36 |
+
"ipython>=8.23.0",
|
37 |
+
"ipykernel>=6.29.4",
|
38 |
+
"mypy>=1.10.0",
|
39 |
+
"jax[cpu]>=0.4.26",
|
40 |
+
"torch>=2.3.0",
|
41 |
+
"pandas-stubs>=2.2.1.240316",
|
42 |
+
"types-pytz>=2024.1.0.20240417",
|
43 |
+
"types-openpyxl>=3.1.0.20240428",
|
44 |
+
"coverage>=7.5.3",
|
45 |
+
]
|
pysr/denoising.py
CHANGED
@@ -1,8 +1,17 @@
|
|
1 |
"""Functions for denoising data during preprocessing."""
|
|
|
|
|
|
|
2 |
import numpy as np
|
|
|
3 |
|
4 |
|
5 |
-
def denoise(
|
|
|
|
|
|
|
|
|
|
|
6 |
"""Denoise the dataset using a Gaussian process."""
|
7 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
8 |
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
|
@@ -14,12 +23,17 @@ def denoise(X, y, Xresampled=None, random_state=None):
|
|
14 |
gpr.fit(X, y)
|
15 |
|
16 |
if Xresampled is not None:
|
17 |
-
return Xresampled, gpr.predict(Xresampled)
|
18 |
|
19 |
-
return X, gpr.predict(X)
|
20 |
|
21 |
|
22 |
-
def multi_denoise(
|
|
|
|
|
|
|
|
|
|
|
23 |
"""Perform `denoise` along each column of `y` independently."""
|
24 |
y = np.stack(
|
25 |
[
|
|
|
1 |
"""Functions for denoising data during preprocessing."""
|
2 |
+
|
3 |
+
from typing import Optional, Tuple, cast
|
4 |
+
|
5 |
import numpy as np
|
6 |
+
from numpy import ndarray
|
7 |
|
8 |
|
9 |
+
def denoise(
|
10 |
+
X: ndarray,
|
11 |
+
y: ndarray,
|
12 |
+
Xresampled: Optional[ndarray] = None,
|
13 |
+
random_state: Optional[np.random.RandomState] = None,
|
14 |
+
) -> Tuple[ndarray, ndarray]:
|
15 |
"""Denoise the dataset using a Gaussian process."""
|
16 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
17 |
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
|
|
|
23 |
gpr.fit(X, y)
|
24 |
|
25 |
if Xresampled is not None:
|
26 |
+
return Xresampled, cast(ndarray, gpr.predict(Xresampled))
|
27 |
|
28 |
+
return X, cast(ndarray, gpr.predict(X))
|
29 |
|
30 |
|
31 |
+
def multi_denoise(
|
32 |
+
X: ndarray,
|
33 |
+
y: ndarray,
|
34 |
+
Xresampled: Optional[ndarray] = None,
|
35 |
+
random_state: Optional[np.random.RandomState] = None,
|
36 |
+
):
|
37 |
"""Perform `denoise` along each column of `y` independently."""
|
38 |
y = np.stack(
|
39 |
[
|
pysr/deprecated.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
"""Various functions to deprecate features."""
|
|
|
2 |
import warnings
|
3 |
|
4 |
from .julia_import import jl
|
|
|
1 |
"""Various functions to deprecate features."""
|
2 |
+
|
3 |
import warnings
|
4 |
|
5 |
from .julia_import import jl
|
pysr/export_jax.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import sympy
|
2 |
|
3 |
# Special since need to reduce arguments.
|
@@ -55,7 +56,9 @@ def sympy2jaxtext(expr, parameters, symbols_in, extra_jax_mappings=None):
|
|
55 |
if issubclass(expr.func, sympy.Float):
|
56 |
parameters.append(float(expr))
|
57 |
return f"parameters[{len(parameters) - 1}]"
|
58 |
-
elif issubclass(expr.func, sympy.Rational)
|
|
|
|
|
59 |
return f"{float(expr)}"
|
60 |
elif issubclass(expr.func, sympy.Integer):
|
61 |
return f"{int(expr)}"
|
|
|
1 |
+
import numpy as np # noqa: F401
|
2 |
import sympy
|
3 |
|
4 |
# Special since need to reduce arguments.
|
|
|
56 |
if issubclass(expr.func, sympy.Float):
|
57 |
parameters.append(float(expr))
|
58 |
return f"parameters[{len(parameters) - 1}]"
|
59 |
+
elif issubclass(expr.func, sympy.Rational) or issubclass(
|
60 |
+
expr.func, sympy.NumberSymbol
|
61 |
+
):
|
62 |
return f"{float(expr)}"
|
63 |
elif issubclass(expr.func, sympy.Integer):
|
64 |
return f"{int(expr)}"
|
pysr/export_latex.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
"""Functions to help export PySR equations to LaTeX."""
|
|
|
2 |
from typing import List, Optional, Tuple
|
3 |
|
4 |
import pandas as pd
|
@@ -152,3 +153,15 @@ def sympy2multilatextable(
|
|
152 |
]
|
153 |
|
154 |
return "\n\n".join(latex_tables)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
"""Functions to help export PySR equations to LaTeX."""
|
2 |
+
|
3 |
from typing import List, Optional, Tuple
|
4 |
|
5 |
import pandas as pd
|
|
|
153 |
]
|
154 |
|
155 |
return "\n\n".join(latex_tables)
|
156 |
+
|
157 |
+
|
158 |
+
def with_preamble(table_string: str) -> str:
|
159 |
+
preamble_string = [
|
160 |
+
r"\usepackage{breqn}",
|
161 |
+
r"\usepackage{booktabs}",
|
162 |
+
"",
|
163 |
+
"...",
|
164 |
+
"",
|
165 |
+
table_string,
|
166 |
+
]
|
167 |
+
return "\n".join(preamble_string)
|
pysr/export_numpy.py
CHANGED
@@ -1,9 +1,12 @@
|
|
1 |
"""Code for exporting discovered expressions to numpy"""
|
|
|
2 |
import warnings
|
|
|
3 |
|
4 |
import numpy as np
|
5 |
import pandas as pd
|
6 |
-
from
|
|
|
7 |
|
8 |
|
9 |
def sympy2numpy(eqn, sympy_symbols, *, selection=None):
|
@@ -13,6 +16,10 @@ def sympy2numpy(eqn, sympy_symbols, *, selection=None):
|
|
13 |
class CallableEquation:
|
14 |
"""Simple wrapper for numpy lambda functions built with sympy"""
|
15 |
|
|
|
|
|
|
|
|
|
16 |
def __init__(self, eqn, sympy_symbols, selection=None):
|
17 |
self._sympy = eqn
|
18 |
self._sympy_symbols = sympy_symbols
|
@@ -28,8 +35,9 @@ class CallableEquation:
|
|
28 |
return self._lambda(
|
29 |
**{k: X[k].values for k in map(str, self._sympy_symbols)}
|
30 |
) * np.ones(expected_shape)
|
|
|
31 |
if self._selection is not None:
|
32 |
-
if X.shape[1] !=
|
33 |
warnings.warn(
|
34 |
"`X` should be of shape (n_samples, len(self._selection)). "
|
35 |
"Automatically filtering `X` to selection. "
|
@@ -37,6 +45,7 @@ class CallableEquation:
|
|
37 |
"this may lead to incorrect predictions and other errors."
|
38 |
)
|
39 |
X = X[:, self._selection]
|
|
|
40 |
return self._lambda(*X.T) * np.ones(expected_shape)
|
41 |
|
42 |
@property
|
|
|
1 |
"""Code for exporting discovered expressions to numpy"""
|
2 |
+
|
3 |
import warnings
|
4 |
+
from typing import List, Union
|
5 |
|
6 |
import numpy as np
|
7 |
import pandas as pd
|
8 |
+
from numpy.typing import NDArray
|
9 |
+
from sympy import Expr, Symbol, lambdify
|
10 |
|
11 |
|
12 |
def sympy2numpy(eqn, sympy_symbols, *, selection=None):
|
|
|
16 |
class CallableEquation:
|
17 |
"""Simple wrapper for numpy lambda functions built with sympy"""
|
18 |
|
19 |
+
_sympy: Expr
|
20 |
+
_sympy_symbols: List[Symbol]
|
21 |
+
_selection: Union[NDArray[np.bool_], None]
|
22 |
+
|
23 |
def __init__(self, eqn, sympy_symbols, selection=None):
|
24 |
self._sympy = eqn
|
25 |
self._sympy_symbols = sympy_symbols
|
|
|
35 |
return self._lambda(
|
36 |
**{k: X[k].values for k in map(str, self._sympy_symbols)}
|
37 |
) * np.ones(expected_shape)
|
38 |
+
|
39 |
if self._selection is not None:
|
40 |
+
if X.shape[1] != self._selection.sum():
|
41 |
warnings.warn(
|
42 |
"`X` should be of shape (n_samples, len(self._selection)). "
|
43 |
"Automatically filtering `X` to selection. "
|
|
|
45 |
"this may lead to incorrect predictions and other errors."
|
46 |
)
|
47 |
X = X[:, self._selection]
|
48 |
+
|
49 |
return self._lambda(*X.T) * np.ones(expected_shape)
|
50 |
|
51 |
@property
|
pysr/export_sympy.py
CHANGED
@@ -1,9 +1,12 @@
|
|
1 |
"""Define utilities to export to sympy"""
|
|
|
2 |
from typing import Callable, Dict, List, Optional
|
3 |
|
4 |
import sympy
|
5 |
from sympy import sympify
|
6 |
|
|
|
|
|
7 |
sympy_mappings = {
|
8 |
"div": lambda x, y: x / y,
|
9 |
"mult": lambda x, y: x * y,
|
@@ -29,8 +32,8 @@ sympy_mappings = {
|
|
29 |
"acosh": lambda x: sympy.acosh(x),
|
30 |
"acosh_abs": lambda x: sympy.acosh(abs(x) + 1),
|
31 |
"asinh": sympy.asinh,
|
32 |
-
"atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
|
33 |
-
"atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
|
34 |
"abs": abs,
|
35 |
"mod": sympy.Mod,
|
36 |
"erf": sympy.erf,
|
@@ -50,6 +53,7 @@ sympy_mappings = {
|
|
50 |
"round": lambda x: sympy.ceiling(x - 0.5),
|
51 |
"max": lambda x, y: sympy.Piecewise((y, x < y), (x, True)),
|
52 |
"min": lambda x, y: sympy.Piecewise((x, x < y), (y, True)),
|
|
|
53 |
"cond": lambda x, y: sympy.Piecewise((y, x > 0), (0.0, True)),
|
54 |
"logical_or": lambda x, y: sympy.Piecewise((1.0, (x > 0) | (y > 0)), (0.0, True)),
|
55 |
"logical_and": lambda x, y: sympy.Piecewise((1.0, (x > 0) & (y > 0)), (0.0, True)),
|
@@ -58,13 +62,13 @@ sympy_mappings = {
|
|
58 |
|
59 |
|
60 |
def create_sympy_symbols_map(
|
61 |
-
feature_names_in:
|
62 |
) -> Dict[str, sympy.Symbol]:
|
63 |
return {variable: sympy.Symbol(variable) for variable in feature_names_in}
|
64 |
|
65 |
|
66 |
def create_sympy_symbols(
|
67 |
-
feature_names_in:
|
68 |
) -> List[sympy.Symbol]:
|
69 |
return [sympy.Symbol(variable) for variable in feature_names_in]
|
70 |
|
@@ -72,7 +76,7 @@ def create_sympy_symbols(
|
|
72 |
def pysr2sympy(
|
73 |
equation: str,
|
74 |
*,
|
75 |
-
feature_names_in: Optional[
|
76 |
extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
|
77 |
):
|
78 |
if feature_names_in is None:
|
@@ -83,7 +87,12 @@ def pysr2sympy(
|
|
83 |
**sympy_mappings,
|
84 |
}
|
85 |
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
|
89 |
def assert_valid_sympy_symbol(var_name: str) -> None:
|
|
|
1 |
"""Define utilities to export to sympy"""
|
2 |
+
|
3 |
from typing import Callable, Dict, List, Optional
|
4 |
|
5 |
import sympy
|
6 |
from sympy import sympify
|
7 |
|
8 |
+
from .utils import ArrayLike
|
9 |
+
|
10 |
sympy_mappings = {
|
11 |
"div": lambda x, y: x / y,
|
12 |
"mult": lambda x, y: x * y,
|
|
|
32 |
"acosh": lambda x: sympy.acosh(x),
|
33 |
"acosh_abs": lambda x: sympy.acosh(abs(x) + 1),
|
34 |
"asinh": sympy.asinh,
|
35 |
+
"atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
|
36 |
+
"atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
|
37 |
"abs": abs,
|
38 |
"mod": sympy.Mod,
|
39 |
"erf": sympy.erf,
|
|
|
53 |
"round": lambda x: sympy.ceiling(x - 0.5),
|
54 |
"max": lambda x, y: sympy.Piecewise((y, x < y), (x, True)),
|
55 |
"min": lambda x, y: sympy.Piecewise((x, x < y), (y, True)),
|
56 |
+
"greater": lambda x, y: sympy.Piecewise((1.0, x > y), (0.0, True)),
|
57 |
"cond": lambda x, y: sympy.Piecewise((y, x > 0), (0.0, True)),
|
58 |
"logical_or": lambda x, y: sympy.Piecewise((1.0, (x > 0) | (y > 0)), (0.0, True)),
|
59 |
"logical_and": lambda x, y: sympy.Piecewise((1.0, (x > 0) & (y > 0)), (0.0, True)),
|
|
|
62 |
|
63 |
|
64 |
def create_sympy_symbols_map(
|
65 |
+
feature_names_in: ArrayLike[str],
|
66 |
) -> Dict[str, sympy.Symbol]:
|
67 |
return {variable: sympy.Symbol(variable) for variable in feature_names_in}
|
68 |
|
69 |
|
70 |
def create_sympy_symbols(
|
71 |
+
feature_names_in: ArrayLike[str],
|
72 |
) -> List[sympy.Symbol]:
|
73 |
return [sympy.Symbol(variable) for variable in feature_names_in]
|
74 |
|
|
|
76 |
def pysr2sympy(
|
77 |
equation: str,
|
78 |
*,
|
79 |
+
feature_names_in: Optional[ArrayLike[str]] = None,
|
80 |
extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
|
81 |
):
|
82 |
if feature_names_in is None:
|
|
|
87 |
**sympy_mappings,
|
88 |
}
|
89 |
|
90 |
+
try:
|
91 |
+
return sympify(equation, locals=local_sympy_mappings, evaluate=False)
|
92 |
+
except TypeError as e:
|
93 |
+
if "got an unexpected keyword argument 'evaluate'" in str(e):
|
94 |
+
return sympify(equation, locals=local_sympy_mappings)
|
95 |
+
raise TypeError(f"Error processing equation '{equation}'") from e
|
96 |
|
97 |
|
98 |
def assert_valid_sympy_symbol(var_name: str) -> None:
|
pysr/export_torch.py
CHANGED
@@ -1,11 +1,9 @@
|
|
1 |
-
|
2 |
-
# From https://github.com/patrick-kidger/sympytorch
|
3 |
-
# Copied here to allow PySR-specific tweaks
|
4 |
-
#####
|
5 |
|
6 |
import collections as co
|
7 |
import functools as ft
|
8 |
|
|
|
9 |
import sympy
|
10 |
|
11 |
|
@@ -84,7 +82,7 @@ def _initialize_torch():
|
|
84 |
}
|
85 |
|
86 |
class _Node(torch.nn.Module):
|
87 |
-
"""
|
88 |
|
89 |
def __init__(self, *, expr, _memodict, _func_lookup, **kwargs):
|
90 |
super().__init__(**kwargs)
|
@@ -116,6 +114,11 @@ def _initialize_torch():
|
|
116 |
self._value = int(expr)
|
117 |
self._torch_func = lambda: self._value
|
118 |
self._args = ()
|
|
|
|
|
|
|
|
|
|
|
119 |
elif issubclass(expr.func, sympy.Symbol):
|
120 |
self._name = expr.name
|
121 |
self._torch_func = lambda value: value
|
@@ -156,7 +159,7 @@ def _initialize_torch():
|
|
156 |
return self._torch_func(*args)
|
157 |
|
158 |
class _SingleSymPyModule(torch.nn.Module):
|
159 |
-
"""
|
160 |
|
161 |
def __init__(
|
162 |
self, expression, symbols_in, selection=None, extra_funcs=None, **kwargs
|
|
|
1 |
+
# Fork of https://github.com/patrick-kidger/sympytorch
|
|
|
|
|
|
|
2 |
|
3 |
import collections as co
|
4 |
import functools as ft
|
5 |
|
6 |
+
import numpy as np # noqa: F401
|
7 |
import sympy
|
8 |
|
9 |
|
|
|
82 |
}
|
83 |
|
84 |
class _Node(torch.nn.Module):
|
85 |
+
"""Forked from https://github.com/patrick-kidger/sympytorch"""
|
86 |
|
87 |
def __init__(self, *, expr, _memodict, _func_lookup, **kwargs):
|
88 |
super().__init__(**kwargs)
|
|
|
114 |
self._value = int(expr)
|
115 |
self._torch_func = lambda: self._value
|
116 |
self._args = ()
|
117 |
+
elif issubclass(expr.func, sympy.NumberSymbol):
|
118 |
+
# Can get here from exp(1) or exact pi
|
119 |
+
self._value = float(expr)
|
120 |
+
self._torch_func = lambda: self._value
|
121 |
+
self._args = ()
|
122 |
elif issubclass(expr.func, sympy.Symbol):
|
123 |
self._name = expr.name
|
124 |
self._torch_func = lambda value: value
|
|
|
159 |
return self._torch_func(*args)
|
160 |
|
161 |
class _SingleSymPyModule(torch.nn.Module):
|
162 |
+
"""Forked from https://github.com/patrick-kidger/sympytorch"""
|
163 |
|
164 |
def __init__(
|
165 |
self, expression, symbols_in, selection=None, extra_funcs=None, **kwargs
|
pysr/feature_selection.py
CHANGED
@@ -1,8 +1,20 @@
|
|
1 |
"""Functions for doing feature selection during preprocessing."""
|
|
|
|
|
|
|
2 |
import numpy as np
|
|
|
|
|
|
|
|
|
3 |
|
4 |
|
5 |
-
def run_feature_selection(
|
|
|
|
|
|
|
|
|
|
|
6 |
"""
|
7 |
Find most important features.
|
8 |
|
@@ -20,11 +32,16 @@ def run_feature_selection(X, y, select_k_features, random_state=None):
|
|
20 |
selector = SelectFromModel(
|
21 |
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|
22 |
)
|
23 |
-
return selector.get_support(indices=
|
24 |
|
25 |
|
26 |
# Function has not been removed only due to usage in module tests
|
27 |
-
def _handle_feature_selection(
|
|
|
|
|
|
|
|
|
|
|
28 |
if select_k_features is not None:
|
29 |
selection = run_feature_selection(X, y, select_k_features)
|
30 |
print(f"Using features {[variable_names[i] for i in selection]}")
|
|
|
1 |
"""Functions for doing feature selection during preprocessing."""
|
2 |
+
|
3 |
+
from typing import Optional, cast
|
4 |
+
|
5 |
import numpy as np
|
6 |
+
from numpy import ndarray
|
7 |
+
from numpy.typing import NDArray
|
8 |
+
|
9 |
+
from .utils import ArrayLike
|
10 |
|
11 |
|
12 |
+
def run_feature_selection(
|
13 |
+
X: ndarray,
|
14 |
+
y: ndarray,
|
15 |
+
select_k_features: int,
|
16 |
+
random_state: Optional[np.random.RandomState] = None,
|
17 |
+
) -> NDArray[np.bool_]:
|
18 |
"""
|
19 |
Find most important features.
|
20 |
|
|
|
32 |
selector = SelectFromModel(
|
33 |
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|
34 |
)
|
35 |
+
return cast(NDArray[np.bool_], selector.get_support(indices=False))
|
36 |
|
37 |
|
38 |
# Function has not been removed only due to usage in module tests
|
39 |
+
def _handle_feature_selection(
|
40 |
+
X: ndarray,
|
41 |
+
select_k_features: Optional[int],
|
42 |
+
y: ndarray,
|
43 |
+
variable_names: ArrayLike[str],
|
44 |
+
):
|
45 |
if select_k_features is not None:
|
46 |
selection = run_feature_selection(X, y, select_k_features)
|
47 |
print(f"Using features {[variable_names[i] for i in selection]}")
|
pysr/julia_helpers.py
CHANGED
@@ -1,11 +1,16 @@
|
|
1 |
"""Functions for initializing the Julia environment and installing deps."""
|
2 |
|
|
|
|
|
3 |
import numpy as np
|
4 |
from juliacall import convert as jl_convert # type: ignore
|
|
|
5 |
|
6 |
from .deprecated import init_julia, install
|
7 |
from .julia_import import jl
|
8 |
|
|
|
|
|
9 |
jl.seval("using Serialization: Serialization")
|
10 |
jl.seval("using PythonCall: PythonCall")
|
11 |
|
@@ -22,24 +27,31 @@ def _escape_filename(filename):
|
|
22 |
return str_repr
|
23 |
|
24 |
|
25 |
-
def _load_cluster_manager(cluster_manager):
|
26 |
jl.seval(f"using ClusterManagers: addprocs_{cluster_manager}")
|
27 |
return jl.seval(f"addprocs_{cluster_manager}")
|
28 |
|
29 |
|
30 |
-
def jl_array(x):
|
31 |
if x is None:
|
32 |
return None
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
|
36 |
-
def jl_serialize(obj):
|
37 |
buf = jl.IOBuffer()
|
38 |
Serialization.serialize(buf, obj)
|
39 |
return np.array(jl.take_b(buf))
|
40 |
|
41 |
|
42 |
-
def jl_deserialize(s):
|
43 |
if s is None:
|
44 |
return s
|
45 |
buf = jl.IOBuffer()
|
|
|
1 |
"""Functions for initializing the Julia environment and installing deps."""
|
2 |
|
3 |
+
from typing import Any, Callable, Union, cast
|
4 |
+
|
5 |
import numpy as np
|
6 |
from juliacall import convert as jl_convert # type: ignore
|
7 |
+
from numpy.typing import NDArray
|
8 |
|
9 |
from .deprecated import init_julia, install
|
10 |
from .julia_import import jl
|
11 |
|
12 |
+
jl_convert = cast(Callable[[Any, Any], Any], jl_convert)
|
13 |
+
|
14 |
jl.seval("using Serialization: Serialization")
|
15 |
jl.seval("using PythonCall: PythonCall")
|
16 |
|
|
|
27 |
return str_repr
|
28 |
|
29 |
|
30 |
+
def _load_cluster_manager(cluster_manager: str):
|
31 |
jl.seval(f"using ClusterManagers: addprocs_{cluster_manager}")
|
32 |
return jl.seval(f"addprocs_{cluster_manager}")
|
33 |
|
34 |
|
35 |
+
def jl_array(x, dtype=None):
|
36 |
if x is None:
|
37 |
return None
|
38 |
+
elif dtype is None:
|
39 |
+
return jl_convert(jl.Array, x)
|
40 |
+
else:
|
41 |
+
return jl_convert(jl.Array[dtype], x)
|
42 |
+
|
43 |
+
|
44 |
+
def jl_is_function(f) -> bool:
|
45 |
+
return cast(bool, jl.seval("op -> op isa Function")(f))
|
46 |
|
47 |
|
48 |
+
def jl_serialize(obj: Any) -> NDArray[np.uint8]:
|
49 |
buf = jl.IOBuffer()
|
50 |
Serialization.serialize(buf, obj)
|
51 |
return np.array(jl.take_b(buf))
|
52 |
|
53 |
|
54 |
+
def jl_deserialize(s: Union[NDArray[np.uint8], None]):
|
55 |
if s is None:
|
56 |
return s
|
57 |
buf = jl.IOBuffer()
|
pysr/julia_import.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
import os
|
2 |
import sys
|
3 |
import warnings
|
|
|
|
|
4 |
|
5 |
# Check if JuliaCall is already loaded, and if so, warn the user
|
6 |
# about the relevant environment variables. If not loaded,
|
@@ -35,31 +37,17 @@ else:
|
|
35 |
os.environ[k] = os.environ.get(k, default)
|
36 |
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
41 |
|
42 |
-
|
43 |
-
autoload_extensions = os.environ.get("PYSR_AUTOLOAD_EXTENSIONS", "yes")
|
44 |
-
if autoload_extensions in {"yes", ""} and jl_version >= (1, 9, 0):
|
45 |
-
try:
|
46 |
-
get_ipython = sys.modules["IPython"].get_ipython
|
47 |
|
48 |
-
|
49 |
-
raise ImportError("console")
|
50 |
|
51 |
-
print(
|
52 |
-
"Detected Jupyter notebook. Loading juliacall extension. Set `PYSR_AUTOLOAD_EXTENSIONS=no` to disable."
|
53 |
-
)
|
54 |
|
55 |
-
|
56 |
-
get_ipython().run_line_magic("load_ext", "juliacall")
|
57 |
-
except Exception:
|
58 |
-
pass
|
59 |
-
elif autoload_extensions not in {"no", "yes", ""}:
|
60 |
-
warnings.warn(
|
61 |
-
"PYSR_AUTOLOAD_EXTENSIONS environment variable is set to something other than 'yes' or 'no' or ''."
|
62 |
-
)
|
63 |
|
64 |
jl.seval("using SymbolicRegression")
|
65 |
SymbolicRegression = jl.SymbolicRegression
|
|
|
1 |
import os
|
2 |
import sys
|
3 |
import warnings
|
4 |
+
from types import ModuleType
|
5 |
+
from typing import cast
|
6 |
|
7 |
# Check if JuliaCall is already loaded, and if so, warn the user
|
8 |
# about the relevant environment variables. If not loaded,
|
|
|
37 |
os.environ[k] = os.environ.get(k, default)
|
38 |
|
39 |
|
40 |
+
autoload_extensions = os.environ.get("PYSR_AUTOLOAD_EXTENSIONS")
|
41 |
+
if autoload_extensions is not None:
|
42 |
+
# Deprecated; so just pass to juliacall
|
43 |
+
os.environ["PYTHON_JULIACALL_AUTOLOAD_IPYTHON_EXTENSION"] = autoload_extensions
|
44 |
|
45 |
+
from juliacall import Main as jl # type: ignore
|
|
|
|
|
|
|
|
|
46 |
|
47 |
+
jl = cast(ModuleType, jl)
|
|
|
48 |
|
|
|
|
|
|
|
49 |
|
50 |
+
jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
jl.seval("using SymbolicRegression")
|
53 |
SymbolicRegression = jl.SymbolicRegression
|
pysr/juliapkg.json
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
"packages": {
|
4 |
"SymbolicRegression": {
|
5 |
"uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
|
6 |
-
"version": "=0.24.
|
7 |
},
|
8 |
"Serialization": {
|
9 |
"uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",
|
|
|
3 |
"packages": {
|
4 |
"SymbolicRegression": {
|
5 |
"uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
|
6 |
+
"version": "=0.24.5"
|
7 |
},
|
8 |
"Serialization": {
|
9 |
"uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",
|
pysr/param_groupings.yml
CHANGED
@@ -14,6 +14,7 @@
|
|
14 |
- loss_function
|
15 |
- model_selection
|
16 |
- dimensional_constraint_penalty
|
|
|
17 |
- Working with Complexities:
|
18 |
- parsimony
|
19 |
- constraints
|
|
|
14 |
- loss_function
|
15 |
- model_selection
|
16 |
- dimensional_constraint_penalty
|
17 |
+
- dimensionless_constants_only
|
18 |
- Working with Complexities:
|
19 |
- parsimony
|
20 |
- constraints
|
pysr/sklearn_monkeypatch.py
CHANGED
@@ -3,8 +3,7 @@
|
|
3 |
from sklearn.utils import validation
|
4 |
|
5 |
|
6 |
-
def _ensure_no_complex_data(*args, **kwargs):
|
7 |
-
...
|
8 |
|
9 |
|
10 |
try:
|
|
|
3 |
from sklearn.utils import validation
|
4 |
|
5 |
|
6 |
+
def _ensure_no_complex_data(*args, **kwargs): ...
|
|
|
7 |
|
8 |
|
9 |
try:
|
pysr/sr.py
CHANGED
@@ -8,27 +8,31 @@ import shutil
|
|
8 |
import sys
|
9 |
import tempfile
|
10 |
import warnings
|
|
|
11 |
from datetime import datetime
|
12 |
from io import StringIO
|
13 |
from multiprocessing import cpu_count
|
14 |
from pathlib import Path
|
15 |
-
from typing import Callable, Dict, List, Optional, Tuple, Union
|
16 |
-
|
17 |
-
if sys.version_info >= (3, 8):
|
18 |
-
from typing import Literal
|
19 |
-
else:
|
20 |
-
from typing_extensions import Literal
|
21 |
|
22 |
import numpy as np
|
23 |
import pandas as pd
|
|
|
|
|
24 |
from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
|
25 |
from sklearn.utils import check_array, check_consistent_length, check_random_state
|
26 |
-
from sklearn.utils.validation import _check_feature_names_in
|
|
|
27 |
|
28 |
from .denoising import denoise, multi_denoise
|
29 |
from .deprecated import DEPRECATED_KWARGS
|
30 |
from .export_jax import sympy2jax
|
31 |
-
from .export_latex import
|
|
|
|
|
|
|
|
|
|
|
32 |
from .export_numpy import sympy2numpy
|
33 |
from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
|
34 |
from .export_torch import sympy2torch
|
@@ -40,17 +44,21 @@ from .julia_helpers import (
|
|
40 |
_load_cluster_manager,
|
41 |
jl_array,
|
42 |
jl_deserialize,
|
|
|
43 |
jl_serialize,
|
44 |
)
|
45 |
from .julia_import import SymbolicRegression, jl
|
46 |
from .utils import (
|
|
|
|
|
47 |
_csv_filename_to_pkl_filename,
|
48 |
_preprocess_julia_floats,
|
49 |
_safe_check_feature_names_in,
|
50 |
_subscriptify,
|
|
|
51 |
)
|
52 |
|
53 |
-
|
54 |
|
55 |
|
56 |
def _process_constraints(binary_operators, unary_operators, constraints):
|
@@ -113,7 +121,7 @@ def _maybe_create_inline_operators(
|
|
113 |
"and underscores are allowed."
|
114 |
)
|
115 |
if (extra_sympy_mappings is None) or (
|
116 |
-
not
|
117 |
):
|
118 |
raise ValueError(
|
119 |
f"Custom function {function_name} is not defined in `extra_sympy_mappings`. "
|
@@ -130,6 +138,7 @@ def _check_assertions(
|
|
130 |
X,
|
131 |
use_custom_variable_names,
|
132 |
variable_names,
|
|
|
133 |
weights,
|
134 |
y,
|
135 |
X_units,
|
@@ -154,6 +163,13 @@ def _check_assertions(
|
|
154 |
"and underscores are allowed."
|
155 |
)
|
156 |
assert_valid_sympy_symbol(var_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
if X_units is not None and len(X_units) != X.shape[1]:
|
158 |
raise ValueError(
|
159 |
"The number of units in `X_units` must equal the number of features in `X`."
|
@@ -178,6 +194,21 @@ def _check_assertions(
|
|
178 |
VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
|
179 |
|
180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
182 |
"""
|
183 |
High-performance symbolic regression algorithm.
|
@@ -309,7 +340,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
309 |
`idx` argument to the function, which is `nothing`
|
310 |
for non-batched, and a 1D array of indices for batched.
|
311 |
Default is `None`.
|
312 |
-
complexity_of_operators : dict[str, float]
|
313 |
If you would like to use a complexity other than 1 for an
|
314 |
operator, specify the complexity here. For example,
|
315 |
`{"sin": 2, "+": 1}` would give a complexity of 2 for each use
|
@@ -318,16 +349,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
318 |
numbers for a complexity, and the total complexity of a tree
|
319 |
will be rounded to the nearest integer after computing.
|
320 |
Default is `None`.
|
321 |
-
complexity_of_constants : float
|
322 |
Complexity of constants. Default is `1`.
|
323 |
-
complexity_of_variables : float
|
324 |
-
|
|
|
|
|
|
|
325 |
parsimony : float
|
326 |
Multiplicative factor for how much to punish complexity.
|
327 |
Default is `0.0032`.
|
328 |
dimensional_constraint_penalty : float
|
329 |
Additive penalty for if dimensional analysis of an expression fails.
|
330 |
By default, this is `1000.0`.
|
|
|
|
|
|
|
331 |
use_frequency : bool
|
332 |
Whether to measure the frequency of complexities, and use that
|
333 |
instead of parsimony to explore equation space. Will naturally
|
@@ -603,22 +640,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
603 |
Units of each variable in the training dataset, `y`.
|
604 |
nout_ : int
|
605 |
Number of output dimensions.
|
606 |
-
selection_mask_ :
|
607 |
-
|
608 |
-
`select_k_features` is set.
|
609 |
tempdir_ : Path
|
610 |
Path to the temporary equations directory.
|
611 |
-
equation_file_ : str
|
612 |
Output equation file name produced by the julia backend.
|
613 |
julia_state_stream_ : ndarray
|
614 |
The serialized state for the julia SymbolicRegression.jl backend (after fitting),
|
615 |
stored as an array of uint8, produced by Julia's Serialization.serialize function.
|
616 |
-
julia_state_
|
617 |
-
The deserialized state.
|
618 |
julia_options_stream_ : ndarray
|
619 |
The serialized julia options, stored as an array of uint8,
|
620 |
-
julia_options_
|
621 |
-
The deserialized julia options.
|
622 |
equation_file_contents_ : list[pandas.DataFrame]
|
623 |
Contents of the equation file output by the Julia backend.
|
624 |
show_pickle_warnings_ : bool
|
@@ -665,6 +697,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
665 |
```
|
666 |
"""
|
667 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
668 |
def __init__(
|
669 |
self,
|
670 |
model_selection: Literal["best", "accuracy", "score"] = "best",
|
@@ -685,9 +733,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
685 |
loss_function: Optional[str] = None,
|
686 |
complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
|
687 |
complexity_of_constants: Union[int, float] = 1,
|
688 |
-
complexity_of_variables: Union[int, float] =
|
689 |
parsimony: float = 0.0032,
|
690 |
dimensional_constraint_penalty: Optional[float] = None,
|
|
|
691 |
use_frequency: bool = True,
|
692 |
use_frequency_in_tournament: bool = True,
|
693 |
adaptive_parsimony_scaling: float = 20.0,
|
@@ -783,6 +832,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
783 |
self.complexity_of_variables = complexity_of_variables
|
784 |
self.parsimony = parsimony
|
785 |
self.dimensional_constraint_penalty = dimensional_constraint_penalty
|
|
|
786 |
self.use_frequency = use_frequency
|
787 |
self.use_frequency_in_tournament = use_frequency_in_tournament
|
788 |
self.adaptive_parsimony_scaling = adaptive_parsimony_scaling
|
@@ -863,15 +913,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
863 |
updated_kwarg_name = DEPRECATED_KWARGS[k]
|
864 |
setattr(self, updated_kwarg_name, v)
|
865 |
warnings.warn(
|
866 |
-
f"{k} has been renamed to {updated_kwarg_name} in PySRRegressor. "
|
867 |
"Please use that instead.",
|
868 |
FutureWarning,
|
869 |
)
|
870 |
# Handle kwargs that have been moved to the fit method
|
871 |
elif k in ["weights", "variable_names", "Xresampled"]:
|
872 |
warnings.warn(
|
873 |
-
f"{k} is a data
|
874 |
-
f"Ignoring parameter; please pass {k} during the call to fit instead.",
|
875 |
FutureWarning,
|
876 |
)
|
877 |
elif k == "julia_project":
|
@@ -888,21 +938,25 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
888 |
FutureWarning,
|
889 |
)
|
890 |
else:
|
891 |
-
|
892 |
-
|
|
|
893 |
)
|
|
|
|
|
|
|
894 |
|
895 |
@classmethod
|
896 |
def from_file(
|
897 |
cls,
|
898 |
-
equation_file,
|
899 |
*,
|
900 |
-
binary_operators=None,
|
901 |
-
unary_operators=None,
|
902 |
-
n_features_in=None,
|
903 |
-
feature_names_in=None,
|
904 |
-
selection_mask=None,
|
905 |
-
nout=1,
|
906 |
verbosity=1,
|
907 |
**pysr_kwargs,
|
908 |
):
|
@@ -911,7 +965,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
911 |
|
912 |
Parameters
|
913 |
----------
|
914 |
-
equation_file : str
|
915 |
Path to a pickle file containing a saved model, or a csv file
|
916 |
containing equations.
|
917 |
binary_operators : list[str]
|
@@ -926,8 +980,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
926 |
feature_names_in : list[str]
|
927 |
Names of the features passed to the model.
|
928 |
Not needed if loading from a pickle file.
|
929 |
-
selection_mask :
|
930 |
-
If using select_k_features
|
931 |
Not needed if loading from a pickle file.
|
932 |
nout : int
|
933 |
Number of outputs of the model.
|
@@ -983,7 +1037,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
983 |
|
984 |
# TODO: copy .bkup file if exists.
|
985 |
model = cls(
|
986 |
-
equation_file=equation_file,
|
987 |
binary_operators=binary_operators,
|
988 |
unary_operators=unary_operators,
|
989 |
**pysr_kwargs,
|
@@ -1003,7 +1057,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1003 |
model.display_feature_names_in_ = feature_names_in
|
1004 |
|
1005 |
if selection_mask is None:
|
1006 |
-
model.selection_mask_ = np.ones(n_features_in, dtype=
|
1007 |
else:
|
1008 |
model.selection_mask_ = selection_mask
|
1009 |
|
@@ -1030,7 +1084,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1030 |
all_equations = equations
|
1031 |
|
1032 |
for i, equations in enumerate(all_equations):
|
1033 |
-
selected = [""
|
1034 |
chosen_row = idx_model_selection(equations, self.model_selection)
|
1035 |
selected[chosen_row] = ">>>>"
|
1036 |
repr_equations = pd.DataFrame(
|
@@ -1063,15 +1117,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1063 |
Handle pickle serialization for PySRRegressor.
|
1064 |
|
1065 |
The Scikit-learn standard requires estimators to be serializable via
|
1066 |
-
`pickle.dumps()`. However,
|
1067 |
-
|
1068 |
-
|
1069 |
-
Thus, for `PySRRegressor` to support pickle serialization, the
|
1070 |
-
`julia_state_stream_` attribute must be hidden from pickle. This will
|
1071 |
-
prevent the `warm_start` of any model that is loaded via `pickle.loads()`,
|
1072 |
-
but does allow all other attributes of a fitted `PySRRegressor` estimator
|
1073 |
-
to be serialized. Note: Jax and Torch format equations are also removed
|
1074 |
-
from the pickled instance.
|
1075 |
"""
|
1076 |
state = self.__dict__
|
1077 |
show_pickle_warning = not (
|
@@ -1137,10 +1184,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1137 |
|
1138 |
@property
|
1139 |
def julia_options_(self):
|
|
|
1140 |
return jl_deserialize(self.julia_options_stream_)
|
1141 |
|
1142 |
@property
|
1143 |
def julia_state_(self):
|
|
|
1144 |
return jl_deserialize(self.julia_state_stream_)
|
1145 |
|
1146 |
@property
|
@@ -1153,7 +1202,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1153 |
)
|
1154 |
return self.julia_state_
|
1155 |
|
1156 |
-
def get_best(self, index=None):
|
1157 |
"""
|
1158 |
Get best equation using `model_selection`.
|
1159 |
|
@@ -1176,8 +1225,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1176 |
Raised when an invalid model selection strategy is provided.
|
1177 |
"""
|
1178 |
check_is_fitted(self, attributes=["equations_"])
|
1179 |
-
if self.equations_ is None:
|
1180 |
-
raise ValueError("No equations have been generated yet.")
|
1181 |
|
1182 |
if index is not None:
|
1183 |
if isinstance(self.equations_, list):
|
@@ -1185,16 +1232,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1185 |
index, list
|
1186 |
), "With multiple output features, index must be a list."
|
1187 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1188 |
-
|
|
|
|
|
1189 |
|
1190 |
if isinstance(self.equations_, list):
|
1191 |
return [
|
1192 |
-
eq.
|
1193 |
for eq in self.equations_
|
1194 |
]
|
1195 |
-
|
1196 |
-
|
1197 |
-
|
|
|
|
|
|
|
1198 |
|
1199 |
def _setup_equation_file(self):
|
1200 |
"""
|
@@ -1219,7 +1271,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1219 |
self.equation_file_ = self.equation_file
|
1220 |
self.equation_file_contents_ = None
|
1221 |
|
1222 |
-
def
|
1223 |
"""
|
1224 |
Ensure parameters passed at initialization are valid.
|
1225 |
|
@@ -1277,59 +1329,57 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1277 |
f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
|
1278 |
)
|
1279 |
|
1280 |
-
|
1281 |
-
|
1282 |
-
|
1283 |
-
|
1284 |
-
|
1285 |
-
|
1286 |
-
|
1287 |
-
|
1288 |
-
|
1289 |
-
|
1290 |
-
|
1291 |
-
|
1292 |
-
|
1293 |
-
|
1294 |
-
|
1295 |
-
|
1296 |
-
|
1297 |
-
parameter_value = default_value
|
1298 |
else:
|
1299 |
-
#
|
1300 |
-
|
1301 |
-
|
1302 |
-
)
|
1303 |
-
|
1304 |
-
elif parameter == "batch_size" and parameter_value < 1:
|
1305 |
-
warnings.warn(
|
1306 |
-
"Given `batch_size` must be greater than or equal to one. "
|
1307 |
-
"`batch_size` has been increased to equal one."
|
1308 |
-
)
|
1309 |
-
parameter_value = 1
|
1310 |
-
elif (
|
1311 |
-
parameter == "progress"
|
1312 |
-
and parameter_value
|
1313 |
-
and "buffer" not in sys.stdout.__dir__()
|
1314 |
-
):
|
1315 |
-
warnings.warn(
|
1316 |
-
"Note: it looks like you are running in Jupyter. "
|
1317 |
-
"The progress bar will be turned off."
|
1318 |
-
)
|
1319 |
-
parameter_value = False
|
1320 |
-
packed_modified_params[parameter] = parameter_value
|
1321 |
|
1322 |
assert (
|
1323 |
-
len(
|
1324 |
-
|
1325 |
-
|
1326 |
-
)
|
1327 |
|
1328 |
-
return
|
1329 |
|
1330 |
def _validate_and_set_fit_params(
|
1331 |
-
self,
|
1332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1333 |
"""
|
1334 |
Validate the parameters passed to the :term`fit` method.
|
1335 |
|
@@ -1349,12 +1399,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1349 |
Weight array of the same shape as `y`.
|
1350 |
Each element is how to weight the mean-square-error loss
|
1351 |
for that particular element of y.
|
1352 |
-
variable_names :
|
1353 |
-
Names of each
|
|
|
|
|
1354 |
X_units : list[str] of length n_features
|
1355 |
-
Units of each
|
1356 |
y_units : str | list[str] of length n_out
|
1357 |
-
Units of each
|
1358 |
|
1359 |
Returns
|
1360 |
-------
|
@@ -1398,6 +1450,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1398 |
"Please use valid names instead."
|
1399 |
)
|
1400 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1401 |
# Data validation and feature name fetching via sklearn
|
1402 |
# This method sets the n_features_in_ attribute
|
1403 |
if Xresampled is not None:
|
@@ -1405,7 +1473,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1405 |
if weights is not None:
|
1406 |
weights = check_array(weights, ensure_2d=False)
|
1407 |
check_consistent_length(weights, y)
|
1408 |
-
X, y = self.
|
1409 |
self.feature_names_in_ = _safe_check_feature_names_in(
|
1410 |
self, variable_names, generate_names=False
|
1411 |
)
|
@@ -1415,10 +1483,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1415 |
self.display_feature_names_in_ = np.array(
|
1416 |
[f"x{_subscriptify(i)}" for i in range(X.shape[1])]
|
1417 |
)
|
|
|
1418 |
else:
|
1419 |
self.display_feature_names_in_ = self.feature_names_in_
|
1420 |
-
|
1421 |
-
variable_names = self.feature_names_in_
|
1422 |
|
1423 |
# Handle multioutput data
|
1424 |
if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
|
@@ -1428,13 +1496,39 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1428 |
else:
|
1429 |
raise NotImplementedError("y shape not supported!")
|
1430 |
|
|
|
1431 |
self.X_units_ = copy.deepcopy(X_units)
|
1432 |
self.y_units_ = copy.deepcopy(y_units)
|
1433 |
|
1434 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1435 |
|
1436 |
def _pre_transform_training_data(
|
1437 |
-
self,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1438 |
):
|
1439 |
"""
|
1440 |
Transform the training data before fitting the symbolic regressor.
|
@@ -1443,17 +1537,19 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1443 |
|
1444 |
Parameters
|
1445 |
----------
|
1446 |
-
X : ndarray
|
1447 |
Training data of shape (n_samples, n_features).
|
1448 |
-
y : ndarray
|
1449 |
Target values of shape (n_samples,) or (n_samples, n_targets).
|
1450 |
Will be cast to X's dtype if necessary.
|
1451 |
-
Xresampled : ndarray |
|
1452 |
Resampled training data, of shape `(n_resampled, n_features)`,
|
1453 |
used for denoising.
|
1454 |
variable_names : list[str]
|
1455 |
Names of each variable in the training dataset, `X`.
|
1456 |
Of length `n_features`.
|
|
|
|
|
1457 |
X_units : list[str]
|
1458 |
Units of each variable in the training dataset, `X`.
|
1459 |
y_units : str | list[str]
|
@@ -1486,24 +1582,43 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1486 |
"""
|
1487 |
# Feature selection transformation
|
1488 |
if self.select_k_features:
|
1489 |
-
|
1490 |
X, y, self.select_k_features, random_state=random_state
|
1491 |
)
|
1492 |
-
X = X[:,
|
1493 |
|
1494 |
if Xresampled is not None:
|
1495 |
-
Xresampled = Xresampled[:,
|
1496 |
|
1497 |
# Reduce variable_names to selection
|
1498 |
-
variable_names =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1499 |
|
1500 |
if X_units is not None:
|
1501 |
-
X_units =
|
|
|
|
|
|
|
1502 |
self.X_units_ = copy.deepcopy(X_units)
|
1503 |
|
1504 |
# Re-perform data validation and feature name updating
|
1505 |
-
X, y = self.
|
1506 |
# Update feature names with selected variable names
|
|
|
1507 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
1508 |
self.display_feature_names_in_ = self.feature_names_in_
|
1509 |
print(f"Using features {self.feature_names_in_}")
|
@@ -1517,22 +1632,29 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1517 |
else:
|
1518 |
X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)
|
1519 |
|
1520 |
-
return X, y, variable_names, X_units, y_units
|
1521 |
|
1522 |
-
def _run(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1523 |
"""
|
1524 |
Run the symbolic regression fitting process on the julia backend.
|
1525 |
|
1526 |
Parameters
|
1527 |
----------
|
1528 |
-
X : ndarray
|
1529 |
Training data of shape `(n_samples, n_features)`.
|
1530 |
-
y : ndarray
|
1531 |
Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
|
1532 |
Will be cast to `X`'s dtype if necessary.
|
1533 |
-
|
1534 |
-
|
1535 |
-
weights : ndarray |
|
1536 |
Weight array of the same shape as `y`.
|
1537 |
Each element is how to weight the mean-square-error loss
|
1538 |
for that particular element of y.
|
@@ -1551,24 +1673,27 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1551 |
"""
|
1552 |
# Need to be global as we don't want to recreate/reinstate julia for
|
1553 |
# every new instance of PySRRegressor
|
1554 |
-
global
|
1555 |
|
1556 |
# These are the parameters which may be modified from the ones
|
1557 |
# specified in init, so we define them here locally:
|
1558 |
-
binary_operators =
|
1559 |
-
unary_operators =
|
1560 |
-
maxdepth =
|
1561 |
-
constraints =
|
|
|
|
|
|
|
|
|
|
|
|
|
1562 |
nested_constraints = self.nested_constraints
|
1563 |
complexity_of_operators = self.complexity_of_operators
|
1564 |
-
|
1565 |
cluster_manager = self.cluster_manager
|
1566 |
-
batch_size = mutated_params["batch_size"]
|
1567 |
-
update_verbosity = mutated_params["update_verbosity"]
|
1568 |
-
progress = mutated_params["progress"]
|
1569 |
|
1570 |
# Start julia backend processes
|
1571 |
-
if not
|
1572 |
print("Compiling Julia backend...")
|
1573 |
|
1574 |
if cluster_manager is not None:
|
@@ -1607,6 +1732,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1607 |
complexity_of_operators_str += f"({k}) => {v}, "
|
1608 |
complexity_of_operators_str += ")"
|
1609 |
complexity_of_operators = jl.seval(complexity_of_operators_str)
|
|
|
|
|
|
|
|
|
1610 |
|
1611 |
custom_loss = jl.seval(
|
1612 |
str(self.elementwise_loss)
|
@@ -1643,16 +1772,30 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1643 |
optimize=self.weight_optimize,
|
1644 |
)
|
1645 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1646 |
# Call to Julia backend.
|
1647 |
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
|
1648 |
options = SymbolicRegression.Options(
|
1649 |
-
binary_operators=
|
1650 |
-
unary_operators=
|
1651 |
bin_constraints=jl_array(bin_constraints),
|
1652 |
una_constraints=jl_array(una_constraints),
|
1653 |
complexity_of_operators=complexity_of_operators,
|
1654 |
complexity_of_constants=self.complexity_of_constants,
|
1655 |
-
complexity_of_variables=
|
1656 |
nested_constraints=nested_constraints,
|
1657 |
elementwise_loss=custom_loss,
|
1658 |
loss_function=custom_full_objective,
|
@@ -1667,6 +1810,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1667 |
# These have the same name:
|
1668 |
parsimony=self.parsimony,
|
1669 |
dimensional_constraint_penalty=self.dimensional_constraint_penalty,
|
|
|
1670 |
alpha=self.alpha,
|
1671 |
maxdepth=maxdepth,
|
1672 |
fast_cycle=self.fast_cycle,
|
@@ -1678,9 +1822,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1678 |
fraction_replaced_hof=self.fraction_replaced_hof,
|
1679 |
should_simplify=self.should_simplify,
|
1680 |
should_optimize_constants=self.should_optimize_constants,
|
1681 |
-
warmup_maxsize_by=
|
1682 |
-
0.0 if self.warmup_maxsize_by is None else self.warmup_maxsize_by
|
1683 |
-
),
|
1684 |
use_frequency=self.use_frequency,
|
1685 |
use_frequency_in_tournament=self.use_frequency_in_tournament,
|
1686 |
adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
|
@@ -1787,7 +1929,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1787 |
if self.delete_tempfiles:
|
1788 |
shutil.rmtree(self.tempdir_)
|
1789 |
|
1790 |
-
|
1791 |
|
1792 |
return self
|
1793 |
|
@@ -1797,9 +1939,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1797 |
y,
|
1798 |
Xresampled=None,
|
1799 |
weights=None,
|
1800 |
-
variable_names: Optional[
|
1801 |
-
|
1802 |
-
|
|
|
|
|
|
|
1803 |
) -> "PySRRegressor":
|
1804 |
"""
|
1805 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
@@ -1858,15 +2003,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1858 |
self.selection_mask_ = None
|
1859 |
self.julia_state_stream_ = None
|
1860 |
self.julia_options_stream_ = None
|
|
|
1861 |
self.X_units_ = None
|
1862 |
self.y_units_ = None
|
1863 |
|
1864 |
-
random_state = check_random_state(self.random_state) # For np random
|
1865 |
-
seed = random_state.get_state()[1][0] # For julia random
|
1866 |
-
|
1867 |
self._setup_equation_file()
|
1868 |
|
1869 |
-
|
1870 |
|
1871 |
(
|
1872 |
X,
|
@@ -1874,10 +2017,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1874 |
Xresampled,
|
1875 |
weights,
|
1876 |
variable_names,
|
|
|
1877 |
X_units,
|
1878 |
y_units,
|
1879 |
) = self._validate_and_set_fit_params(
|
1880 |
-
X,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1881 |
)
|
1882 |
|
1883 |
if X.shape[0] > 10000 and not self.batching:
|
@@ -1891,9 +2042,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1891 |
"More datapoints will lower the search speed."
|
1892 |
)
|
1893 |
|
|
|
|
|
|
|
1894 |
# Pre transformations (feature selection and denoising)
|
1895 |
-
X, y, variable_names, X_units, y_units =
|
1896 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1897 |
)
|
1898 |
|
1899 |
# Warn about large feature counts (still warn if feature count is large
|
@@ -1903,13 +2066,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1903 |
"Note: you are running with 10 features or more. "
|
1904 |
"Genetic algorithms like used in PySR scale poorly with large numbers of features. "
|
1905 |
"You should run PySR for more `niterations` to ensure it can find "
|
1906 |
-
"the correct variables, "
|
1907 |
-
"or, alternatively, do a dimensionality reduction beforehand. "
|
1908 |
-
"For example, `X = PCA(n_components=6).fit_transform(X)`, "
|
1909 |
-
"using scikit-learn's `PCA` class, "
|
1910 |
-
"will reduce the number of features to 6 in an interpretable way, "
|
1911 |
-
"as each resultant feature "
|
1912 |
-
"will be a linear combination of the original features. "
|
1913 |
)
|
1914 |
|
1915 |
# Assertion checks
|
@@ -1920,6 +2077,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1920 |
X,
|
1921 |
use_custom_variable_names,
|
1922 |
variable_names,
|
|
|
1923 |
weights,
|
1924 |
y,
|
1925 |
X_units,
|
@@ -1932,7 +2090,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1932 |
self._checkpoint()
|
1933 |
|
1934 |
# Perform the search:
|
1935 |
-
self._run(X, y,
|
1936 |
|
1937 |
# Then, after fit, we save again, so the pickle file contains
|
1938 |
# the equations:
|
@@ -1941,7 +2099,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1941 |
|
1942 |
return self
|
1943 |
|
1944 |
-
def refresh(self, checkpoint_file=None):
|
1945 |
"""
|
1946 |
Update self.equations_ with any new options passed.
|
1947 |
|
@@ -1950,11 +2108,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1950 |
|
1951 |
Parameters
|
1952 |
----------
|
1953 |
-
checkpoint_file : str
|
1954 |
Path to checkpoint hall of fame file to be loaded.
|
1955 |
The default will use the set `equation_file_`.
|
1956 |
"""
|
1957 |
-
if checkpoint_file:
|
1958 |
self.equation_file_ = checkpoint_file
|
1959 |
self.equation_file_contents_ = None
|
1960 |
check_is_fitted(self, attributes=["equation_file_"])
|
@@ -2006,7 +2164,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2006 |
if self.selection_mask_ is not None:
|
2007 |
# RangeIndex enforces column order allowing columns to
|
2008 |
# be correctly filtered with self.selection_mask_
|
2009 |
-
X = X.
|
2010 |
X.columns = self.feature_names_in_
|
2011 |
# Without feature information, CallableEquation/lambda_format equations
|
2012 |
# require that the column order of X matches that of the X used during
|
@@ -2016,14 +2174,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2016 |
# reordered/reindexed to match those of the transformed (denoised and
|
2017 |
# feature selected) X in fit.
|
2018 |
X = X.reindex(columns=self.feature_names_in_)
|
2019 |
-
X = self.
|
2020 |
|
2021 |
try:
|
2022 |
-
if
|
|
|
2023 |
return np.stack(
|
2024 |
[eq["lambda_format"](X) for eq in best_equation], axis=1
|
2025 |
)
|
2026 |
-
|
|
|
2027 |
except Exception as error:
|
2028 |
raise ValueError(
|
2029 |
"Failed to evaluate the expression. "
|
@@ -2053,9 +2213,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2053 |
"""
|
2054 |
self.refresh()
|
2055 |
best_equation = self.get_best(index=index)
|
2056 |
-
if
|
|
|
2057 |
return [eq["sympy_format"] for eq in best_equation]
|
2058 |
-
|
|
|
2059 |
|
2060 |
def latex(self, index=None, precision=3):
|
2061 |
"""
|
@@ -2115,9 +2277,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2115 |
self.set_params(output_jax_format=True)
|
2116 |
self.refresh()
|
2117 |
best_equation = self.get_best(index=index)
|
2118 |
-
if
|
|
|
2119 |
return [eq["jax_format"] for eq in best_equation]
|
2120 |
-
|
|
|
2121 |
|
2122 |
def pytorch(self, index=None):
|
2123 |
"""
|
@@ -2145,9 +2309,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2145 |
self.set_params(output_torch_format=True)
|
2146 |
self.refresh()
|
2147 |
best_equation = self.get_best(index=index)
|
2148 |
-
if
|
2149 |
return [eq["torch_format"] for eq in best_equation]
|
2150 |
-
|
|
|
2151 |
|
2152 |
def _read_equation_file(self):
|
2153 |
"""Read the hall of fame file created by `SymbolicRegression.jl`."""
|
@@ -2246,10 +2411,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2246 |
lastComplexity = 0
|
2247 |
sympy_format = []
|
2248 |
lambda_format = []
|
2249 |
-
|
2250 |
-
|
2251 |
-
if self.output_torch_format:
|
2252 |
-
torch_format = []
|
2253 |
|
2254 |
for _, eqn_row in output.iterrows():
|
2255 |
eqn = pysr2sympy(
|
@@ -2361,7 +2524,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2361 |
"""
|
2362 |
self.refresh()
|
2363 |
|
2364 |
-
if self.
|
2365 |
if indices is not None:
|
2366 |
assert isinstance(indices, list)
|
2367 |
assert isinstance(indices[0], list)
|
@@ -2370,7 +2533,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2370 |
table_string = sympy2multilatextable(
|
2371 |
self.equations_, indices=indices, precision=precision, columns=columns
|
2372 |
)
|
2373 |
-
|
2374 |
if indices is not None:
|
2375 |
assert isinstance(indices, list)
|
2376 |
assert isinstance(indices[0], int)
|
@@ -2378,15 +2541,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2378 |
table_string = sympy2latextable(
|
2379 |
self.equations_, indices=indices, precision=precision, columns=columns
|
2380 |
)
|
|
|
|
|
|
|
|
|
|
|
2381 |
|
2382 |
-
|
2383 |
-
r"\usepackage{breqn}",
|
2384 |
-
r"\usepackage{booktabs}",
|
2385 |
-
"",
|
2386 |
-
"...",
|
2387 |
-
"",
|
2388 |
-
]
|
2389 |
-
return "\n".join(preamble_string + [table_string])
|
2390 |
|
2391 |
|
2392 |
def idx_model_selection(equations: pd.DataFrame, model_selection: str):
|
@@ -2404,3 +2565,30 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str):
|
|
2404 |
f"{model_selection} is not a valid model selection strategy."
|
2405 |
)
|
2406 |
return chosen_idx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
import sys
|
9 |
import tempfile
|
10 |
import warnings
|
11 |
+
from dataclasses import dataclass, fields
|
12 |
from datetime import datetime
|
13 |
from io import StringIO
|
14 |
from multiprocessing import cpu_count
|
15 |
from pathlib import Path
|
16 |
+
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, cast
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
import numpy as np
|
19 |
import pandas as pd
|
20 |
+
from numpy import ndarray
|
21 |
+
from numpy.typing import NDArray
|
22 |
from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
|
23 |
from sklearn.utils import check_array, check_consistent_length, check_random_state
|
24 |
+
from sklearn.utils.validation import _check_feature_names_in # type: ignore
|
25 |
+
from sklearn.utils.validation import check_is_fitted
|
26 |
|
27 |
from .denoising import denoise, multi_denoise
|
28 |
from .deprecated import DEPRECATED_KWARGS
|
29 |
from .export_jax import sympy2jax
|
30 |
+
from .export_latex import (
|
31 |
+
sympy2latex,
|
32 |
+
sympy2latextable,
|
33 |
+
sympy2multilatextable,
|
34 |
+
with_preamble,
|
35 |
+
)
|
36 |
from .export_numpy import sympy2numpy
|
37 |
from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
|
38 |
from .export_torch import sympy2torch
|
|
|
44 |
_load_cluster_manager,
|
45 |
jl_array,
|
46 |
jl_deserialize,
|
47 |
+
jl_is_function,
|
48 |
jl_serialize,
|
49 |
)
|
50 |
from .julia_import import SymbolicRegression, jl
|
51 |
from .utils import (
|
52 |
+
ArrayLike,
|
53 |
+
PathLike,
|
54 |
_csv_filename_to_pkl_filename,
|
55 |
_preprocess_julia_floats,
|
56 |
_safe_check_feature_names_in,
|
57 |
_subscriptify,
|
58 |
+
_suggest_keywords,
|
59 |
)
|
60 |
|
61 |
+
ALREADY_RAN = False
|
62 |
|
63 |
|
64 |
def _process_constraints(binary_operators, unary_operators, constraints):
|
|
|
121 |
"and underscores are allowed."
|
122 |
)
|
123 |
if (extra_sympy_mappings is None) or (
|
124 |
+
function_name not in extra_sympy_mappings
|
125 |
):
|
126 |
raise ValueError(
|
127 |
f"Custom function {function_name} is not defined in `extra_sympy_mappings`. "
|
|
|
138 |
X,
|
139 |
use_custom_variable_names,
|
140 |
variable_names,
|
141 |
+
complexity_of_variables,
|
142 |
weights,
|
143 |
y,
|
144 |
X_units,
|
|
|
163 |
"and underscores are allowed."
|
164 |
)
|
165 |
assert_valid_sympy_symbol(var_name)
|
166 |
+
if (
|
167 |
+
isinstance(complexity_of_variables, list)
|
168 |
+
and len(complexity_of_variables) != X.shape[1]
|
169 |
+
):
|
170 |
+
raise ValueError(
|
171 |
+
"The number of elements in `complexity_of_variables` must equal the number of features in `X`."
|
172 |
+
)
|
173 |
if X_units is not None and len(X_units) != X.shape[1]:
|
174 |
raise ValueError(
|
175 |
"The number of units in `X_units` must equal the number of features in `X`."
|
|
|
194 |
VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
|
195 |
|
196 |
|
197 |
+
@dataclass
|
198 |
+
class _DynamicallySetParams:
|
199 |
+
"""Defines some parameters that are set at runtime."""
|
200 |
+
|
201 |
+
binary_operators: List[str]
|
202 |
+
unary_operators: List[str]
|
203 |
+
maxdepth: int
|
204 |
+
constraints: Dict[str, str]
|
205 |
+
multithreading: bool
|
206 |
+
batch_size: int
|
207 |
+
update_verbosity: int
|
208 |
+
progress: bool
|
209 |
+
warmup_maxsize_by: float
|
210 |
+
|
211 |
+
|
212 |
class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
213 |
"""
|
214 |
High-performance symbolic regression algorithm.
|
|
|
340 |
`idx` argument to the function, which is `nothing`
|
341 |
for non-batched, and a 1D array of indices for batched.
|
342 |
Default is `None`.
|
343 |
+
complexity_of_operators : dict[str, Union[int, float]]
|
344 |
If you would like to use a complexity other than 1 for an
|
345 |
operator, specify the complexity here. For example,
|
346 |
`{"sin": 2, "+": 1}` would give a complexity of 2 for each use
|
|
|
349 |
numbers for a complexity, and the total complexity of a tree
|
350 |
will be rounded to the nearest integer after computing.
|
351 |
Default is `None`.
|
352 |
+
complexity_of_constants : int | float
|
353 |
Complexity of constants. Default is `1`.
|
354 |
+
complexity_of_variables : int | float
|
355 |
+
Global complexity of variables. To set different complexities for
|
356 |
+
different variables, pass a list of complexities to the `fit` method
|
357 |
+
with keyword `complexity_of_variables`. You cannot use both.
|
358 |
+
Default is `1`.
|
359 |
parsimony : float
|
360 |
Multiplicative factor for how much to punish complexity.
|
361 |
Default is `0.0032`.
|
362 |
dimensional_constraint_penalty : float
|
363 |
Additive penalty for if dimensional analysis of an expression fails.
|
364 |
By default, this is `1000.0`.
|
365 |
+
dimensionless_constants_only : bool
|
366 |
+
Whether to only search for dimensionless constants, if using units.
|
367 |
+
Default is `False`.
|
368 |
use_frequency : bool
|
369 |
Whether to measure the frequency of complexities, and use that
|
370 |
instead of parsimony to explore equation space. Will naturally
|
|
|
640 |
Units of each variable in the training dataset, `y`.
|
641 |
nout_ : int
|
642 |
Number of output dimensions.
|
643 |
+
selection_mask_ : ndarray of shape (`n_features_in_`,)
|
644 |
+
Mask of which features of `X` to use when `select_k_features` is set.
|
|
|
645 |
tempdir_ : Path
|
646 |
Path to the temporary equations directory.
|
647 |
+
equation_file_ : Union[str, Path]
|
648 |
Output equation file name produced by the julia backend.
|
649 |
julia_state_stream_ : ndarray
|
650 |
The serialized state for the julia SymbolicRegression.jl backend (after fitting),
|
651 |
stored as an array of uint8, produced by Julia's Serialization.serialize function.
|
|
|
|
|
652 |
julia_options_stream_ : ndarray
|
653 |
The serialized julia options, stored as an array of uint8,
|
|
|
|
|
654 |
equation_file_contents_ : list[pandas.DataFrame]
|
655 |
Contents of the equation file output by the Julia backend.
|
656 |
show_pickle_warnings_ : bool
|
|
|
697 |
```
|
698 |
"""
|
699 |
|
700 |
+
equations_: Union[pd.DataFrame, List[pd.DataFrame], None]
|
701 |
+
n_features_in_: int
|
702 |
+
feature_names_in_: ArrayLike[str]
|
703 |
+
display_feature_names_in_: ArrayLike[str]
|
704 |
+
complexity_of_variables_: Union[int, float, List[Union[int, float]], None]
|
705 |
+
X_units_: Union[ArrayLike[str], None]
|
706 |
+
y_units_: Union[str, ArrayLike[str], None]
|
707 |
+
nout_: int
|
708 |
+
selection_mask_: Union[NDArray[np.bool_], None]
|
709 |
+
tempdir_: Path
|
710 |
+
equation_file_: PathLike
|
711 |
+
julia_state_stream_: Union[NDArray[np.uint8], None]
|
712 |
+
julia_options_stream_: Union[NDArray[np.uint8], None]
|
713 |
+
equation_file_contents_: Union[List[pd.DataFrame], None]
|
714 |
+
show_pickle_warnings_: bool
|
715 |
+
|
716 |
def __init__(
|
717 |
self,
|
718 |
model_selection: Literal["best", "accuracy", "score"] = "best",
|
|
|
733 |
loss_function: Optional[str] = None,
|
734 |
complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
|
735 |
complexity_of_constants: Union[int, float] = 1,
|
736 |
+
complexity_of_variables: Optional[Union[int, float]] = None,
|
737 |
parsimony: float = 0.0032,
|
738 |
dimensional_constraint_penalty: Optional[float] = None,
|
739 |
+
dimensionless_constants_only: bool = False,
|
740 |
use_frequency: bool = True,
|
741 |
use_frequency_in_tournament: bool = True,
|
742 |
adaptive_parsimony_scaling: float = 20.0,
|
|
|
832 |
self.complexity_of_variables = complexity_of_variables
|
833 |
self.parsimony = parsimony
|
834 |
self.dimensional_constraint_penalty = dimensional_constraint_penalty
|
835 |
+
self.dimensionless_constants_only = dimensionless_constants_only
|
836 |
self.use_frequency = use_frequency
|
837 |
self.use_frequency_in_tournament = use_frequency_in_tournament
|
838 |
self.adaptive_parsimony_scaling = adaptive_parsimony_scaling
|
|
|
913 |
updated_kwarg_name = DEPRECATED_KWARGS[k]
|
914 |
setattr(self, updated_kwarg_name, v)
|
915 |
warnings.warn(
|
916 |
+
f"`{k}` has been renamed to `{updated_kwarg_name}` in PySRRegressor. "
|
917 |
"Please use that instead.",
|
918 |
FutureWarning,
|
919 |
)
|
920 |
# Handle kwargs that have been moved to the fit method
|
921 |
elif k in ["weights", "variable_names", "Xresampled"]:
|
922 |
warnings.warn(
|
923 |
+
f"`{k}` is a data-dependent parameter and should be passed when fit is called. "
|
924 |
+
f"Ignoring parameter; please pass `{k}` during the call to fit instead.",
|
925 |
FutureWarning,
|
926 |
)
|
927 |
elif k == "julia_project":
|
|
|
938 |
FutureWarning,
|
939 |
)
|
940 |
else:
|
941 |
+
suggested_keywords = _suggest_keywords(PySRRegressor, k)
|
942 |
+
err_msg = (
|
943 |
+
f"`{k}` is not a valid keyword argument for PySRRegressor."
|
944 |
)
|
945 |
+
if len(suggested_keywords) > 0:
|
946 |
+
err_msg += f" Did you mean {', '.join(map(lambda s: f'`{s}`', suggested_keywords))}?"
|
947 |
+
raise TypeError(err_msg)
|
948 |
|
949 |
@classmethod
|
950 |
def from_file(
|
951 |
cls,
|
952 |
+
equation_file: PathLike,
|
953 |
*,
|
954 |
+
binary_operators: Optional[List[str]] = None,
|
955 |
+
unary_operators: Optional[List[str]] = None,
|
956 |
+
n_features_in: Optional[int] = None,
|
957 |
+
feature_names_in: Optional[ArrayLike[str]] = None,
|
958 |
+
selection_mask: Optional[NDArray[np.bool_]] = None,
|
959 |
+
nout: int = 1,
|
960 |
verbosity=1,
|
961 |
**pysr_kwargs,
|
962 |
):
|
|
|
965 |
|
966 |
Parameters
|
967 |
----------
|
968 |
+
equation_file : str or Path
|
969 |
Path to a pickle file containing a saved model, or a csv file
|
970 |
containing equations.
|
971 |
binary_operators : list[str]
|
|
|
980 |
feature_names_in : list[str]
|
981 |
Names of the features passed to the model.
|
982 |
Not needed if loading from a pickle file.
|
983 |
+
selection_mask : NDArray[np.bool_]
|
984 |
+
If using `select_k_features`, you must pass `model.selection_mask_` here.
|
985 |
Not needed if loading from a pickle file.
|
986 |
nout : int
|
987 |
Number of outputs of the model.
|
|
|
1037 |
|
1038 |
# TODO: copy .bkup file if exists.
|
1039 |
model = cls(
|
1040 |
+
equation_file=str(equation_file),
|
1041 |
binary_operators=binary_operators,
|
1042 |
unary_operators=unary_operators,
|
1043 |
**pysr_kwargs,
|
|
|
1057 |
model.display_feature_names_in_ = feature_names_in
|
1058 |
|
1059 |
if selection_mask is None:
|
1060 |
+
model.selection_mask_ = np.ones(n_features_in, dtype=np.bool_)
|
1061 |
else:
|
1062 |
model.selection_mask_ = selection_mask
|
1063 |
|
|
|
1084 |
all_equations = equations
|
1085 |
|
1086 |
for i, equations in enumerate(all_equations):
|
1087 |
+
selected = pd.Series([""] * len(equations), index=equations.index)
|
1088 |
chosen_row = idx_model_selection(equations, self.model_selection)
|
1089 |
selected[chosen_row] = ">>>>"
|
1090 |
repr_equations = pd.DataFrame(
|
|
|
1117 |
Handle pickle serialization for PySRRegressor.
|
1118 |
|
1119 |
The Scikit-learn standard requires estimators to be serializable via
|
1120 |
+
`pickle.dumps()`. However, some attributes do not support pickling
|
1121 |
+
and need to be hidden, such as the JAX and Torch representations.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1122 |
"""
|
1123 |
state = self.__dict__
|
1124 |
show_pickle_warning = not (
|
|
|
1184 |
|
1185 |
@property
|
1186 |
def julia_options_(self):
|
1187 |
+
"""The deserialized julia options."""
|
1188 |
return jl_deserialize(self.julia_options_stream_)
|
1189 |
|
1190 |
@property
|
1191 |
def julia_state_(self):
|
1192 |
+
"""The deserialized state."""
|
1193 |
return jl_deserialize(self.julia_state_stream_)
|
1194 |
|
1195 |
@property
|
|
|
1202 |
)
|
1203 |
return self.julia_state_
|
1204 |
|
1205 |
+
def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
|
1206 |
"""
|
1207 |
Get best equation using `model_selection`.
|
1208 |
|
|
|
1225 |
Raised when an invalid model selection strategy is provided.
|
1226 |
"""
|
1227 |
check_is_fitted(self, attributes=["equations_"])
|
|
|
|
|
1228 |
|
1229 |
if index is not None:
|
1230 |
if isinstance(self.equations_, list):
|
|
|
1232 |
index, list
|
1233 |
), "With multiple output features, index must be a list."
|
1234 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1235 |
+
else:
|
1236 |
+
equations_ = cast(pd.DataFrame, self.equations_)
|
1237 |
+
return cast(pd.Series, equations_.iloc[index])
|
1238 |
|
1239 |
if isinstance(self.equations_, list):
|
1240 |
return [
|
1241 |
+
cast(pd.Series, eq.loc[idx_model_selection(eq, self.model_selection)])
|
1242 |
for eq in self.equations_
|
1243 |
]
|
1244 |
+
else:
|
1245 |
+
equations_ = cast(pd.DataFrame, self.equations_)
|
1246 |
+
return cast(
|
1247 |
+
pd.Series,
|
1248 |
+
equations_.loc[idx_model_selection(equations_, self.model_selection)],
|
1249 |
+
)
|
1250 |
|
1251 |
def _setup_equation_file(self):
|
1252 |
"""
|
|
|
1271 |
self.equation_file_ = self.equation_file
|
1272 |
self.equation_file_contents_ = None
|
1273 |
|
1274 |
+
def _validate_and_modify_params(self) -> _DynamicallySetParams:
|
1275 |
"""
|
1276 |
Ensure parameters passed at initialization are valid.
|
1277 |
|
|
|
1329 |
f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
|
1330 |
)
|
1331 |
|
1332 |
+
param_container = _DynamicallySetParams(
|
1333 |
+
binary_operators=["+", "*", "-", "/"],
|
1334 |
+
unary_operators=[],
|
1335 |
+
maxdepth=self.maxsize,
|
1336 |
+
constraints={},
|
1337 |
+
multithreading=self.procs != 0 and self.cluster_manager is None,
|
1338 |
+
batch_size=1,
|
1339 |
+
update_verbosity=int(self.verbosity),
|
1340 |
+
progress=self.progress,
|
1341 |
+
warmup_maxsize_by=0.0,
|
1342 |
+
)
|
1343 |
+
|
1344 |
+
for param_name in map(lambda x: x.name, fields(_DynamicallySetParams)):
|
1345 |
+
user_param_value = getattr(self, param_name)
|
1346 |
+
if user_param_value is None:
|
1347 |
+
# Leave as the default in DynamicallySetParams
|
1348 |
+
...
|
|
|
1349 |
else:
|
1350 |
+
# If user has specified it, we will override the default.
|
1351 |
+
# However, there are some special cases to mutate it:
|
1352 |
+
new_param_value = _mutate_parameter(param_name, user_param_value)
|
1353 |
+
setattr(param_container, param_name, new_param_value)
|
1354 |
+
# TODO: This should just be part of the __init__ of _DynamicallySetParams
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1355 |
|
1356 |
assert (
|
1357 |
+
len(param_container.binary_operators) > 0
|
1358 |
+
or len(param_container.unary_operators) > 0
|
1359 |
+
), "At least one operator must be provided."
|
|
|
1360 |
|
1361 |
+
return param_container
|
1362 |
|
1363 |
def _validate_and_set_fit_params(
|
1364 |
+
self,
|
1365 |
+
X,
|
1366 |
+
y,
|
1367 |
+
Xresampled,
|
1368 |
+
weights,
|
1369 |
+
variable_names,
|
1370 |
+
complexity_of_variables,
|
1371 |
+
X_units,
|
1372 |
+
y_units,
|
1373 |
+
) -> Tuple[
|
1374 |
+
ndarray,
|
1375 |
+
ndarray,
|
1376 |
+
Optional[ndarray],
|
1377 |
+
Optional[ndarray],
|
1378 |
+
ArrayLike[str],
|
1379 |
+
Union[int, float, List[Union[int, float]]],
|
1380 |
+
Optional[ArrayLike[str]],
|
1381 |
+
Optional[Union[str, ArrayLike[str]]],
|
1382 |
+
]:
|
1383 |
"""
|
1384 |
Validate the parameters passed to the :term`fit` method.
|
1385 |
|
|
|
1399 |
Weight array of the same shape as `y`.
|
1400 |
Each element is how to weight the mean-square-error loss
|
1401 |
for that particular element of y.
|
1402 |
+
variable_names : ndarray of length n_features
|
1403 |
+
Names of each feature in the training dataset, `X`.
|
1404 |
+
complexity_of_variables : int | float | list[int | float]
|
1405 |
+
Complexity of each feature in the training dataset, `X`.
|
1406 |
X_units : list[str] of length n_features
|
1407 |
+
Units of each feature in the training dataset, `X`.
|
1408 |
y_units : str | list[str] of length n_out
|
1409 |
+
Units of each feature in the training dataset, `y`.
|
1410 |
|
1411 |
Returns
|
1412 |
-------
|
|
|
1450 |
"Please use valid names instead."
|
1451 |
)
|
1452 |
|
1453 |
+
if (
|
1454 |
+
complexity_of_variables is not None
|
1455 |
+
and self.complexity_of_variables is not None
|
1456 |
+
):
|
1457 |
+
raise ValueError(
|
1458 |
+
"You cannot set `complexity_of_variables` at both `fit` and `__init__`. "
|
1459 |
+
"Pass it at `__init__` to set it to global default, OR use `fit` to set it for "
|
1460 |
+
"each variable individually."
|
1461 |
+
)
|
1462 |
+
elif complexity_of_variables is not None:
|
1463 |
+
complexity_of_variables = complexity_of_variables
|
1464 |
+
elif self.complexity_of_variables is not None:
|
1465 |
+
complexity_of_variables = self.complexity_of_variables
|
1466 |
+
else:
|
1467 |
+
complexity_of_variables = 1
|
1468 |
+
|
1469 |
# Data validation and feature name fetching via sklearn
|
1470 |
# This method sets the n_features_in_ attribute
|
1471 |
if Xresampled is not None:
|
|
|
1473 |
if weights is not None:
|
1474 |
weights = check_array(weights, ensure_2d=False)
|
1475 |
check_consistent_length(weights, y)
|
1476 |
+
X, y = self._validate_data_X_y(X, y)
|
1477 |
self.feature_names_in_ = _safe_check_feature_names_in(
|
1478 |
self, variable_names, generate_names=False
|
1479 |
)
|
|
|
1483 |
self.display_feature_names_in_ = np.array(
|
1484 |
[f"x{_subscriptify(i)}" for i in range(X.shape[1])]
|
1485 |
)
|
1486 |
+
variable_names = self.feature_names_in_
|
1487 |
else:
|
1488 |
self.display_feature_names_in_ = self.feature_names_in_
|
1489 |
+
variable_names = self.feature_names_in_
|
|
|
1490 |
|
1491 |
# Handle multioutput data
|
1492 |
if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
|
|
|
1496 |
else:
|
1497 |
raise NotImplementedError("y shape not supported!")
|
1498 |
|
1499 |
+
self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
|
1500 |
self.X_units_ = copy.deepcopy(X_units)
|
1501 |
self.y_units_ = copy.deepcopy(y_units)
|
1502 |
|
1503 |
+
return (
|
1504 |
+
X,
|
1505 |
+
y,
|
1506 |
+
Xresampled,
|
1507 |
+
weights,
|
1508 |
+
variable_names,
|
1509 |
+
complexity_of_variables,
|
1510 |
+
X_units,
|
1511 |
+
y_units,
|
1512 |
+
)
|
1513 |
+
|
1514 |
+
def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
|
1515 |
+
raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore
|
1516 |
+
return cast(Tuple[ndarray, ndarray], raw_out)
|
1517 |
+
|
1518 |
+
def _validate_data_X(self, X) -> Tuple[ndarray]:
|
1519 |
+
raw_out = self._validate_data(X=X, reset=False) # type: ignore
|
1520 |
+
return cast(Tuple[ndarray], raw_out)
|
1521 |
|
1522 |
def _pre_transform_training_data(
|
1523 |
+
self,
|
1524 |
+
X: ndarray,
|
1525 |
+
y: ndarray,
|
1526 |
+
Xresampled: Union[ndarray, None],
|
1527 |
+
variable_names: ArrayLike[str],
|
1528 |
+
complexity_of_variables: Union[int, float, List[Union[int, float]]],
|
1529 |
+
X_units: Union[ArrayLike[str], None],
|
1530 |
+
y_units: Union[ArrayLike[str], str, None],
|
1531 |
+
random_state: np.random.RandomState,
|
1532 |
):
|
1533 |
"""
|
1534 |
Transform the training data before fitting the symbolic regressor.
|
|
|
1537 |
|
1538 |
Parameters
|
1539 |
----------
|
1540 |
+
X : ndarray
|
1541 |
Training data of shape (n_samples, n_features).
|
1542 |
+
y : ndarray
|
1543 |
Target values of shape (n_samples,) or (n_samples, n_targets).
|
1544 |
Will be cast to X's dtype if necessary.
|
1545 |
+
Xresampled : ndarray | None
|
1546 |
Resampled training data, of shape `(n_resampled, n_features)`,
|
1547 |
used for denoising.
|
1548 |
variable_names : list[str]
|
1549 |
Names of each variable in the training dataset, `X`.
|
1550 |
Of length `n_features`.
|
1551 |
+
complexity_of_variables : int | float | list[int | float]
|
1552 |
+
Complexity of each variable in the training dataset, `X`.
|
1553 |
X_units : list[str]
|
1554 |
Units of each variable in the training dataset, `X`.
|
1555 |
y_units : str | list[str]
|
|
|
1582 |
"""
|
1583 |
# Feature selection transformation
|
1584 |
if self.select_k_features:
|
1585 |
+
selection_mask = run_feature_selection(
|
1586 |
X, y, self.select_k_features, random_state=random_state
|
1587 |
)
|
1588 |
+
X = X[:, selection_mask]
|
1589 |
|
1590 |
if Xresampled is not None:
|
1591 |
+
Xresampled = Xresampled[:, selection_mask]
|
1592 |
|
1593 |
# Reduce variable_names to selection
|
1594 |
+
variable_names = cast(
|
1595 |
+
ArrayLike[str],
|
1596 |
+
[
|
1597 |
+
variable_names[i]
|
1598 |
+
for i in range(len(variable_names))
|
1599 |
+
if selection_mask[i]
|
1600 |
+
],
|
1601 |
+
)
|
1602 |
+
|
1603 |
+
if isinstance(complexity_of_variables, list):
|
1604 |
+
complexity_of_variables = [
|
1605 |
+
complexity_of_variables[i]
|
1606 |
+
for i in range(len(complexity_of_variables))
|
1607 |
+
if selection_mask[i]
|
1608 |
+
]
|
1609 |
+
self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
|
1610 |
|
1611 |
if X_units is not None:
|
1612 |
+
X_units = cast(
|
1613 |
+
ArrayLike[str],
|
1614 |
+
[X_units[i] for i in range(len(X_units)) if selection_mask[i]],
|
1615 |
+
)
|
1616 |
self.X_units_ = copy.deepcopy(X_units)
|
1617 |
|
1618 |
# Re-perform data validation and feature name updating
|
1619 |
+
X, y = self._validate_data_X_y(X, y)
|
1620 |
# Update feature names with selected variable names
|
1621 |
+
self.selection_mask_ = selection_mask
|
1622 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
1623 |
self.display_feature_names_in_ = self.feature_names_in_
|
1624 |
print(f"Using features {self.feature_names_in_}")
|
|
|
1632 |
else:
|
1633 |
X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)
|
1634 |
|
1635 |
+
return X, y, variable_names, complexity_of_variables, X_units, y_units
|
1636 |
|
1637 |
+
def _run(
|
1638 |
+
self,
|
1639 |
+
X: ndarray,
|
1640 |
+
y: ndarray,
|
1641 |
+
runtime_params: _DynamicallySetParams,
|
1642 |
+
weights: Optional[ndarray],
|
1643 |
+
seed: int,
|
1644 |
+
):
|
1645 |
"""
|
1646 |
Run the symbolic regression fitting process on the julia backend.
|
1647 |
|
1648 |
Parameters
|
1649 |
----------
|
1650 |
+
X : ndarray
|
1651 |
Training data of shape `(n_samples, n_features)`.
|
1652 |
+
y : ndarray
|
1653 |
Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
|
1654 |
Will be cast to `X`'s dtype if necessary.
|
1655 |
+
runtime_params : DynamicallySetParams
|
1656 |
+
Dynamically set versions of some parameters passed in __init__.
|
1657 |
+
weights : ndarray | None
|
1658 |
Weight array of the same shape as `y`.
|
1659 |
Each element is how to weight the mean-square-error loss
|
1660 |
for that particular element of y.
|
|
|
1673 |
"""
|
1674 |
# Need to be global as we don't want to recreate/reinstate julia for
|
1675 |
# every new instance of PySRRegressor
|
1676 |
+
global ALREADY_RAN
|
1677 |
|
1678 |
# These are the parameters which may be modified from the ones
|
1679 |
# specified in init, so we define them here locally:
|
1680 |
+
binary_operators = runtime_params.binary_operators
|
1681 |
+
unary_operators = runtime_params.unary_operators
|
1682 |
+
maxdepth = runtime_params.maxdepth
|
1683 |
+
constraints = runtime_params.constraints
|
1684 |
+
multithreading = runtime_params.multithreading
|
1685 |
+
batch_size = runtime_params.batch_size
|
1686 |
+
update_verbosity = runtime_params.update_verbosity
|
1687 |
+
progress = runtime_params.progress
|
1688 |
+
warmup_maxsize_by = runtime_params.warmup_maxsize_by
|
1689 |
+
|
1690 |
nested_constraints = self.nested_constraints
|
1691 |
complexity_of_operators = self.complexity_of_operators
|
1692 |
+
complexity_of_variables = self.complexity_of_variables_
|
1693 |
cluster_manager = self.cluster_manager
|
|
|
|
|
|
|
1694 |
|
1695 |
# Start julia backend processes
|
1696 |
+
if not ALREADY_RAN and update_verbosity != 0:
|
1697 |
print("Compiling Julia backend...")
|
1698 |
|
1699 |
if cluster_manager is not None:
|
|
|
1732 |
complexity_of_operators_str += f"({k}) => {v}, "
|
1733 |
complexity_of_operators_str += ")"
|
1734 |
complexity_of_operators = jl.seval(complexity_of_operators_str)
|
1735 |
+
# TODO: Refactor this into helper function
|
1736 |
+
|
1737 |
+
if isinstance(complexity_of_variables, list):
|
1738 |
+
complexity_of_variables = jl_array(complexity_of_variables)
|
1739 |
|
1740 |
custom_loss = jl.seval(
|
1741 |
str(self.elementwise_loss)
|
|
|
1772 |
optimize=self.weight_optimize,
|
1773 |
)
|
1774 |
|
1775 |
+
jl_binary_operators: List[Any] = []
|
1776 |
+
jl_unary_operators: List[Any] = []
|
1777 |
+
for input_list, output_list, name in [
|
1778 |
+
(binary_operators, jl_binary_operators, "binary"),
|
1779 |
+
(unary_operators, jl_unary_operators, "unary"),
|
1780 |
+
]:
|
1781 |
+
for op in input_list:
|
1782 |
+
jl_op = jl.seval(op)
|
1783 |
+
if not jl_is_function(jl_op):
|
1784 |
+
raise ValueError(
|
1785 |
+
f"When building `{name}_operators`, `'{op}'` did not return a Julia function"
|
1786 |
+
)
|
1787 |
+
output_list.append(jl_op)
|
1788 |
+
|
1789 |
# Call to Julia backend.
|
1790 |
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
|
1791 |
options = SymbolicRegression.Options(
|
1792 |
+
binary_operators=jl_array(jl_binary_operators, dtype=jl.Function),
|
1793 |
+
unary_operators=jl_array(jl_unary_operators, dtype=jl.Function),
|
1794 |
bin_constraints=jl_array(bin_constraints),
|
1795 |
una_constraints=jl_array(una_constraints),
|
1796 |
complexity_of_operators=complexity_of_operators,
|
1797 |
complexity_of_constants=self.complexity_of_constants,
|
1798 |
+
complexity_of_variables=complexity_of_variables,
|
1799 |
nested_constraints=nested_constraints,
|
1800 |
elementwise_loss=custom_loss,
|
1801 |
loss_function=custom_full_objective,
|
|
|
1810 |
# These have the same name:
|
1811 |
parsimony=self.parsimony,
|
1812 |
dimensional_constraint_penalty=self.dimensional_constraint_penalty,
|
1813 |
+
dimensionless_constants_only=self.dimensionless_constants_only,
|
1814 |
alpha=self.alpha,
|
1815 |
maxdepth=maxdepth,
|
1816 |
fast_cycle=self.fast_cycle,
|
|
|
1822 |
fraction_replaced_hof=self.fraction_replaced_hof,
|
1823 |
should_simplify=self.should_simplify,
|
1824 |
should_optimize_constants=self.should_optimize_constants,
|
1825 |
+
warmup_maxsize_by=warmup_maxsize_by,
|
|
|
|
|
1826 |
use_frequency=self.use_frequency,
|
1827 |
use_frequency_in_tournament=self.use_frequency_in_tournament,
|
1828 |
adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
|
|
|
1929 |
if self.delete_tempfiles:
|
1930 |
shutil.rmtree(self.tempdir_)
|
1931 |
|
1932 |
+
ALREADY_RAN = True
|
1933 |
|
1934 |
return self
|
1935 |
|
|
|
1939 |
y,
|
1940 |
Xresampled=None,
|
1941 |
weights=None,
|
1942 |
+
variable_names: Optional[ArrayLike[str]] = None,
|
1943 |
+
complexity_of_variables: Optional[
|
1944 |
+
Union[int, float, List[Union[int, float]]]
|
1945 |
+
] = None,
|
1946 |
+
X_units: Optional[ArrayLike[str]] = None,
|
1947 |
+
y_units: Optional[Union[str, ArrayLike[str]]] = None,
|
1948 |
) -> "PySRRegressor":
|
1949 |
"""
|
1950 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
|
|
2003 |
self.selection_mask_ = None
|
2004 |
self.julia_state_stream_ = None
|
2005 |
self.julia_options_stream_ = None
|
2006 |
+
self.complexity_of_variables_ = None
|
2007 |
self.X_units_ = None
|
2008 |
self.y_units_ = None
|
2009 |
|
|
|
|
|
|
|
2010 |
self._setup_equation_file()
|
2011 |
|
2012 |
+
runtime_params = self._validate_and_modify_params()
|
2013 |
|
2014 |
(
|
2015 |
X,
|
|
|
2017 |
Xresampled,
|
2018 |
weights,
|
2019 |
variable_names,
|
2020 |
+
complexity_of_variables,
|
2021 |
X_units,
|
2022 |
y_units,
|
2023 |
) = self._validate_and_set_fit_params(
|
2024 |
+
X,
|
2025 |
+
y,
|
2026 |
+
Xresampled,
|
2027 |
+
weights,
|
2028 |
+
variable_names,
|
2029 |
+
complexity_of_variables,
|
2030 |
+
X_units,
|
2031 |
+
y_units,
|
2032 |
)
|
2033 |
|
2034 |
if X.shape[0] > 10000 and not self.batching:
|
|
|
2042 |
"More datapoints will lower the search speed."
|
2043 |
)
|
2044 |
|
2045 |
+
random_state = check_random_state(self.random_state) # For np random
|
2046 |
+
seed = cast(int, random_state.randint(0, 2**31 - 1)) # For julia random
|
2047 |
+
|
2048 |
# Pre transformations (feature selection and denoising)
|
2049 |
+
X, y, variable_names, complexity_of_variables, X_units, y_units = (
|
2050 |
+
self._pre_transform_training_data(
|
2051 |
+
X,
|
2052 |
+
y,
|
2053 |
+
Xresampled,
|
2054 |
+
variable_names,
|
2055 |
+
complexity_of_variables,
|
2056 |
+
X_units,
|
2057 |
+
y_units,
|
2058 |
+
random_state,
|
2059 |
+
)
|
2060 |
)
|
2061 |
|
2062 |
# Warn about large feature counts (still warn if feature count is large
|
|
|
2066 |
"Note: you are running with 10 features or more. "
|
2067 |
"Genetic algorithms like used in PySR scale poorly with large numbers of features. "
|
2068 |
"You should run PySR for more `niterations` to ensure it can find "
|
2069 |
+
"the correct variables, and consider using a larger `maxsize`."
|
|
|
|
|
|
|
|
|
|
|
|
|
2070 |
)
|
2071 |
|
2072 |
# Assertion checks
|
|
|
2077 |
X,
|
2078 |
use_custom_variable_names,
|
2079 |
variable_names,
|
2080 |
+
complexity_of_variables,
|
2081 |
weights,
|
2082 |
y,
|
2083 |
X_units,
|
|
|
2090 |
self._checkpoint()
|
2091 |
|
2092 |
# Perform the search:
|
2093 |
+
self._run(X, y, runtime_params, weights=weights, seed=seed)
|
2094 |
|
2095 |
# Then, after fit, we save again, so the pickle file contains
|
2096 |
# the equations:
|
|
|
2099 |
|
2100 |
return self
|
2101 |
|
2102 |
+
def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None:
|
2103 |
"""
|
2104 |
Update self.equations_ with any new options passed.
|
2105 |
|
|
|
2108 |
|
2109 |
Parameters
|
2110 |
----------
|
2111 |
+
checkpoint_file : str or Path
|
2112 |
Path to checkpoint hall of fame file to be loaded.
|
2113 |
The default will use the set `equation_file_`.
|
2114 |
"""
|
2115 |
+
if checkpoint_file is not None:
|
2116 |
self.equation_file_ = checkpoint_file
|
2117 |
self.equation_file_contents_ = None
|
2118 |
check_is_fitted(self, attributes=["equation_file_"])
|
|
|
2164 |
if self.selection_mask_ is not None:
|
2165 |
# RangeIndex enforces column order allowing columns to
|
2166 |
# be correctly filtered with self.selection_mask_
|
2167 |
+
X = X[X.columns[self.selection_mask_]]
|
2168 |
X.columns = self.feature_names_in_
|
2169 |
# Without feature information, CallableEquation/lambda_format equations
|
2170 |
# require that the column order of X matches that of the X used during
|
|
|
2174 |
# reordered/reindexed to match those of the transformed (denoised and
|
2175 |
# feature selected) X in fit.
|
2176 |
X = X.reindex(columns=self.feature_names_in_)
|
2177 |
+
X = self._validate_data_X(X)
|
2178 |
|
2179 |
try:
|
2180 |
+
if isinstance(best_equation, list):
|
2181 |
+
assert self.nout_ > 1
|
2182 |
return np.stack(
|
2183 |
[eq["lambda_format"](X) for eq in best_equation], axis=1
|
2184 |
)
|
2185 |
+
else:
|
2186 |
+
return best_equation["lambda_format"](X)
|
2187 |
except Exception as error:
|
2188 |
raise ValueError(
|
2189 |
"Failed to evaluate the expression. "
|
|
|
2213 |
"""
|
2214 |
self.refresh()
|
2215 |
best_equation = self.get_best(index=index)
|
2216 |
+
if isinstance(best_equation, list):
|
2217 |
+
assert self.nout_ > 1
|
2218 |
return [eq["sympy_format"] for eq in best_equation]
|
2219 |
+
else:
|
2220 |
+
return best_equation["sympy_format"]
|
2221 |
|
2222 |
def latex(self, index=None, precision=3):
|
2223 |
"""
|
|
|
2277 |
self.set_params(output_jax_format=True)
|
2278 |
self.refresh()
|
2279 |
best_equation = self.get_best(index=index)
|
2280 |
+
if isinstance(best_equation, list):
|
2281 |
+
assert self.nout_ > 1
|
2282 |
return [eq["jax_format"] for eq in best_equation]
|
2283 |
+
else:
|
2284 |
+
return best_equation["jax_format"]
|
2285 |
|
2286 |
def pytorch(self, index=None):
|
2287 |
"""
|
|
|
2309 |
self.set_params(output_torch_format=True)
|
2310 |
self.refresh()
|
2311 |
best_equation = self.get_best(index=index)
|
2312 |
+
if isinstance(best_equation, list):
|
2313 |
return [eq["torch_format"] for eq in best_equation]
|
2314 |
+
else:
|
2315 |
+
return best_equation["torch_format"]
|
2316 |
|
2317 |
def _read_equation_file(self):
|
2318 |
"""Read the hall of fame file created by `SymbolicRegression.jl`."""
|
|
|
2411 |
lastComplexity = 0
|
2412 |
sympy_format = []
|
2413 |
lambda_format = []
|
2414 |
+
jax_format = []
|
2415 |
+
torch_format = []
|
|
|
|
|
2416 |
|
2417 |
for _, eqn_row in output.iterrows():
|
2418 |
eqn = pysr2sympy(
|
|
|
2524 |
"""
|
2525 |
self.refresh()
|
2526 |
|
2527 |
+
if isinstance(self.equations_, list):
|
2528 |
if indices is not None:
|
2529 |
assert isinstance(indices, list)
|
2530 |
assert isinstance(indices[0], list)
|
|
|
2533 |
table_string = sympy2multilatextable(
|
2534 |
self.equations_, indices=indices, precision=precision, columns=columns
|
2535 |
)
|
2536 |
+
elif isinstance(self.equations_, pd.DataFrame):
|
2537 |
if indices is not None:
|
2538 |
assert isinstance(indices, list)
|
2539 |
assert isinstance(indices[0], int)
|
|
|
2541 |
table_string = sympy2latextable(
|
2542 |
self.equations_, indices=indices, precision=precision, columns=columns
|
2543 |
)
|
2544 |
+
else:
|
2545 |
+
raise ValueError(
|
2546 |
+
"Invalid type for equations_ to pass to `latex_table`. "
|
2547 |
+
"Expected a DataFrame or a list of DataFrames."
|
2548 |
+
)
|
2549 |
|
2550 |
+
return with_preamble(table_string)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2551 |
|
2552 |
|
2553 |
def idx_model_selection(equations: pd.DataFrame, model_selection: str):
|
|
|
2565 |
f"{model_selection} is not a valid model selection strategy."
|
2566 |
)
|
2567 |
return chosen_idx
|
2568 |
+
|
2569 |
+
|
2570 |
+
def _mutate_parameter(param_name: str, param_value):
|
2571 |
+
if param_name in ["binary_operators", "unary_operators"] and isinstance(
|
2572 |
+
param_value, str
|
2573 |
+
):
|
2574 |
+
return [param_value]
|
2575 |
+
|
2576 |
+
if param_name == "batch_size" and param_value < 1:
|
2577 |
+
warnings.warn(
|
2578 |
+
"Given `batch_size` must be greater than or equal to one. "
|
2579 |
+
"`batch_size` has been increased to equal one."
|
2580 |
+
)
|
2581 |
+
return 1
|
2582 |
+
|
2583 |
+
if (
|
2584 |
+
param_name == "progress"
|
2585 |
+
and param_value == True
|
2586 |
+
and "buffer" not in sys.stdout.__dir__()
|
2587 |
+
):
|
2588 |
+
warnings.warn(
|
2589 |
+
"Note: it looks like you are running in Jupyter. "
|
2590 |
+
"The progress bar will be turned off."
|
2591 |
+
)
|
2592 |
+
return False
|
2593 |
+
|
2594 |
+
return param_value
|
pysr/test/__main__.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
"""CLI for running PySR's test suite."""
|
|
|
2 |
import argparse
|
3 |
|
4 |
from . import *
|
|
|
1 |
"""CLI for running PySR's test suite."""
|
2 |
+
|
3 |
import argparse
|
4 |
|
5 |
from . import *
|
pysr/test/params.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import inspect
|
2 |
|
3 |
-
from
|
4 |
|
5 |
DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
|
6 |
DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
|
|
|
1 |
import inspect
|
2 |
|
3 |
+
from pysr import PySRRegressor
|
4 |
|
5 |
DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
|
6 |
DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
|
pysr/test/test.py
CHANGED
@@ -11,12 +11,18 @@ import pandas as pd
|
|
11 |
import sympy
|
12 |
from sklearn.utils.estimator_checks import check_estimator
|
13 |
|
14 |
-
from
|
15 |
-
from
|
16 |
-
from
|
17 |
-
from
|
18 |
-
from
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
from .params import (
|
21 |
DEFAULT_NCYCLES,
|
22 |
DEFAULT_NITERATIONS,
|
@@ -24,6 +30,11 @@ from .params import (
|
|
24 |
DEFAULT_POPULATIONS,
|
25 |
)
|
26 |
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
class TestPipeline(unittest.TestCase):
|
29 |
def setUp(self):
|
@@ -171,6 +182,63 @@ class TestPipeline(unittest.TestCase):
|
|
171 |
self.assertLessEqual(mse1, 1e-4)
|
172 |
self.assertLessEqual(mse2, 1e-4)
|
173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
def test_multioutput_weighted_with_callable_temp_equation(self):
|
175 |
X = self.X.copy()
|
176 |
y = X[:, [0, 1]] ** 2
|
@@ -308,7 +376,10 @@ class TestPipeline(unittest.TestCase):
|
|
308 |
"unused_feature": self.rstate.randn(500),
|
309 |
}
|
310 |
)
|
311 |
-
|
|
|
|
|
|
|
312 |
y = true_fn(X)
|
313 |
noise = self.rstate.randn(500) * 0.01
|
314 |
y = y + noise
|
@@ -367,13 +438,12 @@ class TestPipeline(unittest.TestCase):
|
|
367 |
|
368 |
def test_load_model(self):
|
369 |
"""See if we can load a ran model from the equation file."""
|
370 |
-
csv_file_data = """
|
371 |
-
Complexity,Loss,Equation
|
372 |
1,0.19951081,"1.9762075"
|
373 |
3,0.12717344,"(f0 + 1.4724599)"
|
374 |
4,0.104823045,"pow_abs(2.2683423, cos(f3))\""""
|
375 |
# Strip the indents:
|
376 |
-
csv_file_data = "\n".join([
|
377 |
|
378 |
for from_backup in [False, True]:
|
379 |
rand_dir = Path(tempfile.mkdtemp())
|
@@ -425,12 +495,22 @@ class TestPipeline(unittest.TestCase):
|
|
425 |
if os.path.exists(file_to_delete):
|
426 |
os.remove(file_to_delete)
|
427 |
|
428 |
-
pickle_file = rand_dir / "equations.pkl"
|
429 |
model3 = PySRRegressor.from_file(
|
430 |
model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2}
|
431 |
)
|
432 |
np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X))
|
433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
434 |
|
435 |
def manually_create_model(equations, feature_names=None):
|
436 |
if feature_names is None:
|
@@ -526,7 +606,7 @@ class TestFeatureSelection(unittest.TestCase):
|
|
526 |
X = self.rstate.randn(20000, 5)
|
527 |
y = X[:, 2] ** 2 + X[:, 3] ** 2
|
528 |
selected = run_feature_selection(X, y, select_k_features=2)
|
529 |
-
|
530 |
|
531 |
def test_feature_selection_handler(self):
|
532 |
X = self.rstate.randn(20000, 5)
|
@@ -538,8 +618,8 @@ class TestFeatureSelection(unittest.TestCase):
|
|
538 |
variable_names=var_names,
|
539 |
y=y,
|
540 |
)
|
541 |
-
|
542 |
-
selected_var_names = [var_names[i] for i in selection]
|
543 |
self.assertEqual(set(selected_var_names), set("x2 x3".split(" ")))
|
544 |
np.testing.assert_array_equal(
|
545 |
np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)
|
@@ -563,6 +643,105 @@ class TestMiscellaneous(unittest.TestCase):
|
|
563 |
test_pkl_file = _csv_filename_to_pkl_filename(str(equation_file))
|
564 |
self.assertEqual(test_pkl_file, str(expected_pkl_file))
|
565 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
566 |
def test_deprecation(self):
|
567 |
"""Ensure that deprecation works as expected.
|
568 |
|
@@ -705,100 +884,28 @@ class TestMiscellaneous(unittest.TestCase):
|
|
705 |
model.get_best()
|
706 |
print("Failed", opt["kwargs"])
|
707 |
|
708 |
-
def
|
709 |
-
|
710 |
-
|
711 |
-
|
712 |
-
temp_equation_file=True,
|
713 |
-
procs=0,
|
714 |
-
multithreading=False,
|
715 |
)
|
716 |
-
nout = 3
|
717 |
-
X = np.random.randn(100, 2)
|
718 |
-
y = np.random.randn(100, nout)
|
719 |
-
model.fit(X, y)
|
720 |
-
contents = model.equation_file_contents_.copy()
|
721 |
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
for i in range(1, nout + 1):
|
726 |
-
assert not os.path.exists(str(equation_file_base) + f".out{i}.bkup")
|
727 |
-
|
728 |
-
with tempfile.NamedTemporaryFile() as pickle_file:
|
729 |
-
pkl.dump(model, pickle_file)
|
730 |
-
pickle_file.seek(0)
|
731 |
-
model2 = pkl.load(pickle_file)
|
732 |
-
|
733 |
-
contents2 = model2.equation_file_contents_
|
734 |
-
cols_to_check = ["equation", "loss", "complexity"]
|
735 |
-
for frame1, frame2 in zip(contents, contents2):
|
736 |
-
pd.testing.assert_frame_equal(frame1[cols_to_check], frame2[cols_to_check])
|
737 |
-
|
738 |
-
y_predictions2 = model2.predict(X)
|
739 |
-
np.testing.assert_array_equal(y_predictions, y_predictions2)
|
740 |
|
741 |
-
|
742 |
-
|
743 |
-
|
744 |
-
|
745 |
-
|
746 |
-
|
747 |
-
verbosity=0,
|
748 |
-
progress=False,
|
749 |
-
random_state=0,
|
750 |
-
deterministic=True, # Deterministic as tests require this.
|
751 |
-
procs=0,
|
752 |
-
multithreading=False,
|
753 |
-
warm_start=False,
|
754 |
-
temp_equation_file=True,
|
755 |
-
) # Return early.
|
756 |
-
|
757 |
-
check_generator = check_estimator(model, generate_only=True)
|
758 |
-
exception_messages = []
|
759 |
-
for _, check in check_generator:
|
760 |
-
if check.func.__name__ == "check_complex_data":
|
761 |
-
# We can use complex data, so avoid this check.
|
762 |
-
continue
|
763 |
-
try:
|
764 |
-
with warnings.catch_warnings():
|
765 |
-
warnings.simplefilter("ignore")
|
766 |
-
check(model)
|
767 |
-
print("Passed", check.func.__name__)
|
768 |
-
except Exception:
|
769 |
-
error_message = str(traceback.format_exc())
|
770 |
-
exception_messages.append(
|
771 |
-
f"{check.func.__name__}:\n" + error_message + "\n"
|
772 |
-
)
|
773 |
-
print("Failed", check.func.__name__, "with:")
|
774 |
-
# Add a leading tab to error message, which
|
775 |
-
# might be multi-line:
|
776 |
-
print("\n".join([(" " * 4) + row for row in error_message.split("\n")]))
|
777 |
-
# If any checks failed don't let the test pass.
|
778 |
-
self.assertEqual(len(exception_messages), 0)
|
779 |
-
|
780 |
-
def test_param_groupings(self):
|
781 |
-
"""Test that param_groupings are complete"""
|
782 |
-
param_groupings_file = Path(__file__).parent.parent / "param_groupings.yml"
|
783 |
-
if not param_groupings_file.exists():
|
784 |
-
return
|
785 |
-
|
786 |
-
# Read the file, discarding lines ending in ":",
|
787 |
-
# and removing leading "\s*-\s*":
|
788 |
-
params = []
|
789 |
-
with open(param_groupings_file, "r") as f:
|
790 |
-
for line in f.readlines():
|
791 |
-
if line.strip().endswith(":"):
|
792 |
-
continue
|
793 |
-
if line.strip().startswith("-"):
|
794 |
-
params.append(line.strip()[1:].strip())
|
795 |
|
796 |
-
|
797 |
-
|
798 |
-
|
799 |
|
800 |
-
|
801 |
-
self.assertSetEqual(set(params), set(regressor_params))
|
802 |
|
803 |
|
804 |
TRUE_PREAMBLE = "\n".join(
|
@@ -932,7 +1039,7 @@ class TestLaTeXTable(unittest.TestCase):
|
|
932 |
middle_part_2 = r"""
|
933 |
$y_{1} = x_{1}$ & $1$ & $1.32$ & $0.0$ \\
|
934 |
$y_{1} = \cos{\left(x_{1} \right)}$ & $2$ & $0.0520$ & $3.23$ \\
|
935 |
-
$y_{1} = x_{0}
|
936 |
"""
|
937 |
true_latex_table_str = "\n\n".join(
|
938 |
self.create_true_latex(part, include_score=True)
|
@@ -985,7 +1092,7 @@ class TestLaTeXTable(unittest.TestCase):
|
|
985 |
middle_part = r"""
|
986 |
$y = x_{0}$ & $1$ & $1.05$ & $0.0$ \\
|
987 |
$y = \cos{\left(x_{0} \right)}$ & $2$ & $0.0232$ & $3.82$ \\
|
988 |
-
\begin{minipage}{0.8\linewidth} \vspace{-1em} \begin{dmath*} y = x_{0}
|
989 |
"""
|
990 |
true_latex_table_str = (
|
991 |
TRUE_PREAMBLE
|
@@ -1039,8 +1146,14 @@ class TestDimensionalConstraints(unittest.TestCase):
|
|
1039 |
"""This just checks the number of units passed"""
|
1040 |
use_custom_variable_names = False
|
1041 |
variable_names = None
|
|
|
1042 |
weights = None
|
1043 |
-
args = (
|
|
|
|
|
|
|
|
|
|
|
1044 |
valid_units = [
|
1045 |
(np.ones((10, 2)), np.ones(10), ["m/s", "s"], "m"),
|
1046 |
(np.ones((10, 1)), np.ones(10), ["m/s"], None),
|
@@ -1148,6 +1261,7 @@ def runtests(just_tests=False):
|
|
1148 |
TestBest,
|
1149 |
TestFeatureSelection,
|
1150 |
TestMiscellaneous,
|
|
|
1151 |
TestLaTeXTable,
|
1152 |
TestDimensionalConstraints,
|
1153 |
]
|
|
|
11 |
import sympy
|
12 |
from sklearn.utils.estimator_checks import check_estimator
|
13 |
|
14 |
+
from pysr import PySRRegressor, install, jl
|
15 |
+
from pysr.export_latex import sympy2latex
|
16 |
+
from pysr.feature_selection import _handle_feature_selection, run_feature_selection
|
17 |
+
from pysr.julia_helpers import init_julia
|
18 |
+
from pysr.sr import (
|
19 |
+
_check_assertions,
|
20 |
+
_process_constraints,
|
21 |
+
_suggest_keywords,
|
22 |
+
idx_model_selection,
|
23 |
+
)
|
24 |
+
from pysr.utils import _csv_filename_to_pkl_filename
|
25 |
+
|
26 |
from .params import (
|
27 |
DEFAULT_NCYCLES,
|
28 |
DEFAULT_NITERATIONS,
|
|
|
30 |
DEFAULT_POPULATIONS,
|
31 |
)
|
32 |
|
33 |
+
# Disables local saving (a value already present in the environment is kept):
os.environ.setdefault("SYMBOLIC_REGRESSION_IS_TESTING", "true")
|
37 |
+
|
38 |
|
39 |
class TestPipeline(unittest.TestCase):
|
40 |
def setUp(self):
|
|
|
182 |
self.assertLessEqual(mse1, 1e-4)
|
183 |
self.assertLessEqual(mse2, 1e-4)
|
184 |
|
185 |
+
def test_custom_variable_complexity(self):
    """Check `complexity_of_variables` passed at `__init__` or at `fit`.

    The two target columns are the two input features themselves, and "+"
    is the only operator. Case 1 supplies a per-variable list `[2, 3]`;
    case 2 supplies the scalar `2`. The best equation of each output must
    reach a loss of at most 1e-8 and report the expected complexity.
    """
    for outer in (True, False):
        for case in (1, 2):
            y = self.X[:, [0, 1]]
            per_variable = case == 1
            complexity_kwargs = dict(
                complexity_of_variables=[2, 3] if per_variable else 2
            )
            expected_second = 3 if per_variable else 2

            # `outer` routes the setting through the constructor; otherwise
            # it is passed to `fit`.
            if outer:
                init_kwargs, fit_kwargs = complexity_kwargs, dict()
            else:
                init_kwargs, fit_kwargs = dict(), complexity_kwargs

            stop_condition = (
                f"stop_if_{case}(l, c) = l < 1e-8 && c <= {expected_second}"
            )
            model = PySRRegressor(
                binary_operators=["+"],
                verbosity=0,
                **self.default_test_kwargs,
                early_stop_condition=stop_condition,
                **init_kwargs,
            )
            model.fit(self.X[:, [0, 1]], y, **fit_kwargs)

            for output_index in (0, 1):
                self.assertLessEqual(
                    model.get_best()[output_index]["loss"], 1e-8
                )

            for output_index, expected in ((0, 2), (1, expected_second)):
                self.assertEqual(
                    model.get_best()[output_index]["complexity"], expected
                )
|
218 |
+
|
219 |
+
def test_error_message_custom_variable_complexity(self):
    """A three-element `complexity_of_variables` for two features raises `ValueError`."""
    features = np.ones((10, 2))
    target = np.ones((10,))
    regressor = PySRRegressor()
    expected_fragment = "number of elements in `complexity_of_variables`"

    with self.assertRaises(ValueError) as caught:
        regressor.fit(features, target, complexity_of_variables=[1, 2, 3])

    self.assertIn(expected_fragment, str(caught.exception))
|
229 |
+
|
230 |
+
def test_error_message_both_variable_complexity(self):
    """Giving `complexity_of_variables` to both `__init__` and `fit` raises `ValueError`."""
    features = np.ones((10, 2))
    target = np.ones((10,))
    regressor = PySRRegressor(complexity_of_variables=[1, 2])
    expected_message = (
        "You cannot set `complexity_of_variables` at both `fit` and `__init__`."
    )

    with self.assertRaises(ValueError) as caught:
        regressor.fit(features, target, complexity_of_variables=[1, 2, 3])

    self.assertIn(expected_message, str(caught.exception))
|
241 |
+
|
242 |
def test_multioutput_weighted_with_callable_temp_equation(self):
|
243 |
X = self.X.copy()
|
244 |
y = X[:, [0, 1]] ** 2
|
|
|
376 |
"unused_feature": self.rstate.randn(500),
|
377 |
}
|
378 |
)
|
379 |
+
|
380 |
+
def true_fn(x):
|
381 |
+
return np.array(x["T"] + x["x"] ** 2 + 1.323837)
|
382 |
+
|
383 |
y = true_fn(X)
|
384 |
noise = self.rstate.randn(500) * 0.01
|
385 |
y = y + noise
|
|
|
438 |
|
439 |
def test_load_model(self):
|
440 |
"""See if we can load a ran model from the equation file."""
|
441 |
+
csv_file_data = """Complexity,Loss,Equation
|
|
|
442 |
1,0.19951081,"1.9762075"
|
443 |
3,0.12717344,"(f0 + 1.4724599)"
|
444 |
4,0.104823045,"pow_abs(2.2683423, cos(f3))\""""
|
445 |
# Strip the indents:
|
446 |
+
csv_file_data = "\n".join([line.strip() for line in csv_file_data.split("\n")])
|
447 |
|
448 |
for from_backup in [False, True]:
|
449 |
rand_dir = Path(tempfile.mkdtemp())
|
|
|
495 |
if os.path.exists(file_to_delete):
|
496 |
os.remove(file_to_delete)
|
497 |
|
498 |
+
# pickle_file = rand_dir / "equations.pkl"
|
499 |
model3 = PySRRegressor.from_file(
|
500 |
model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2}
|
501 |
)
|
502 |
np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X))
|
503 |
|
504 |
+
def test_jl_function_error(self):
    """Fitting with the unary operator string `"1"` raises a descriptive `ValueError`."""
    # TODO: Move this to better class
    expected_message = (
        "When building `unary_operators`, `'1'` did not return a Julia function"
    )

    with self.assertRaises(ValueError) as caught:
        regressor = PySRRegressor(unary_operators=["1"])
        regressor.fit([[1]], [1])

    self.assertIn(expected_message, str(caught.exception))
|
513 |
+
|
514 |
|
515 |
def manually_create_model(equations, feature_names=None):
|
516 |
if feature_names is None:
|
|
|
606 |
X = self.rstate.randn(20000, 5)
|
607 |
y = X[:, 2] ** 2 + X[:, 3] ** 2
|
608 |
selected = run_feature_selection(X, y, select_k_features=2)
|
609 |
+
np.testing.assert_array_equal(selected, [False, False, True, True, False])
|
610 |
|
611 |
def test_feature_selection_handler(self):
|
612 |
X = self.rstate.randn(20000, 5)
|
|
|
618 |
variable_names=var_names,
|
619 |
y=y,
|
620 |
)
|
621 |
+
np.testing.assert_array_equal(selection, [False, False, True, True, False])
|
622 |
+
selected_var_names = [var_names[i] for i in range(5) if selection[i]]
|
623 |
self.assertEqual(set(selected_var_names), set("x2 x3".split(" ")))
|
624 |
np.testing.assert_array_equal(
|
625 |
np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)
|
|
|
643 |
test_pkl_file = _csv_filename_to_pkl_filename(str(equation_file))
|
644 |
self.assertEqual(test_pkl_file, str(expected_pkl_file))
|
645 |
|
646 |
+
def test_pickle_with_temp_equation_file(self):
    """Round-trip a model fitted with a temporary equation file through pickle.

    After fitting a three-output model with `temp_equation_file=True`, the
    test checks that no `.out{i}.bkup` file exists next to the equation
    file, then pickles and unpickles the estimator and requires the
    restored copy to hold the same `equation`/`loss`/`complexity` columns
    and to produce (almost) equal predictions.
    """
    model = PySRRegressor(
        populations=int(1 + DEFAULT_POPULATIONS / 5),
        temp_equation_file=True,
        procs=0,
        multithreading=False,
    )
    nout = 3
    X = np.random.randn(100, 2)
    y = np.random.randn(100, nout)
    model.fit(X, y)
    contents = model.equation_file_contents_.copy()

    y_predictions = model.predict(X)

    equation_file_base = model.equation_file_
    for i in range(1, nout + 1):
        backup_file = str(equation_file_base) + f".out{i}.bkup"
        # Use a unittest assertion: a bare `assert` is removed under
        # `python -O` and reports nothing about which file was found.
        self.assertFalse(
            os.path.exists(backup_file),
            f"Unexpected backup file: {backup_file}",
        )

    with tempfile.NamedTemporaryFile() as pickle_file:
        pkl.dump(model, pickle_file)
        pickle_file.seek(0)
        model2 = pkl.load(pickle_file)

    contents2 = model2.equation_file_contents_
    cols_to_check = ["equation", "loss", "complexity"]
    # `zip` stops at the shorter sequence, so a restored model with missing
    # frames would otherwise pass the loop below unnoticed.
    self.assertEqual(len(contents), len(contents2))
    for frame1, frame2 in zip(contents, contents2):
        pd.testing.assert_frame_equal(frame1[cols_to_check], frame2[cols_to_check])

    y_predictions2 = model2.predict(X)
    np.testing.assert_array_almost_equal(y_predictions, y_predictions2)
|
678 |
+
|
679 |
+
def test_scikit_learn_compatibility(self):
    """Test PySRRegressor compatibility with scikit-learn.

    Runs every check yielded by ``check_estimator(..., generate_only=True)``
    except ``check_complex_data``, collects the traceback of each failing
    check, and fails at the end if any check raised.
    """
    # Search sizes are derived from the DEFAULT_* test constants
    # (presumably scaled down so each check returns early).
    model = PySRRegressor(
        niterations=int(1 + DEFAULT_NITERATIONS / 10),
        populations=int(1 + DEFAULT_POPULATIONS / 3),
        ncycles_per_iteration=int(2 + DEFAULT_NCYCLES / 10),
        verbosity=0,
        progress=False,
        random_state=0,
        deterministic=True,  # Deterministic as tests require this.
        procs=0,
        multithreading=False,
        warm_start=False,
        temp_equation_file=True,
    )

    check_generator = check_estimator(model, generate_only=True)
    exception_messages = []
    for _, check in check_generator:
        check_name = check.func.__name__
        if check_name == "check_complex_data":
            # We can use complex data, so avoid this check.
            continue
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                check(model)
            print("Passed", check_name)
        except Exception:
            # Deliberately broad: every failing check is recorded so that
            # one run reports all of them instead of stopping at the first.
            error_message = traceback.format_exc()
            exception_messages.append(f"{check_name}:\n" + error_message + "\n")
            print("Failed", check_name, "with:")
            # Add a leading tab to error message, which
            # might be multi-line:
            print("\n".join((" " * 4) + row for row in error_message.split("\n")))

    # If any checks failed don't let the test pass, and put the collected
    # tracebacks in the failure message so they show up in the test report.
    self.assertEqual(
        len(exception_messages),
        0,
        msg="Failed scikit-learn checks:\n" + "\n".join(exception_messages),
    )
|
717 |
+
|
718 |
+
def test_param_groupings(self):
    """Test that param_groupings are complete.

    Collects every ``- name`` entry from ``param_groupings.yml`` (two
    directories above this file's directory level, i.e. the package root)
    and checks that the set of names equals the keys of ``DEFAULT_PARAMS``
    minus ``self`` and ``kwargs``.
    """
    param_groupings_file = Path(__file__).parent.parent / "param_groupings.yml"
    if not param_groupings_file.exists():
        # Report a skip rather than a silent pass when there is nothing
        # to compare against.
        self.skipTest(f"{param_groupings_file} does not exist")

    # Read the file, discarding lines ending in ":",
    # and removing leading "\s*-\s*":
    params = []
    with open(param_groupings_file, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if stripped.endswith(":"):
                # Group headers (including "- Group:" items) are not params.
                continue
            if stripped.startswith("-"):
                params.append(stripped[1:].strip())

    regressor_params = [
        p for p in DEFAULT_PARAMS.keys() if p not in ["self", "kwargs"]
    ]

    # Check the sets are equal:
    self.assertSetEqual(set(params), set(regressor_params))
|
740 |
+
|
741 |
+
|
742 |
+
class TestHelpMessages(unittest.TestCase):
|
743 |
+
"""Test user help messages."""
|
744 |
+
|
745 |
def test_deprecation(self):
|
746 |
"""Ensure that deprecation works as expected.
|
747 |
|
|
|
884 |
model.get_best()
|
885 |
print("Failed", opt["kwargs"])
|
886 |
|
887 |
+
def test_suggest_keywords(self):
    """Check keyword suggestions for exact, misspelled and distant names.

    Covers ``_suggest_keywords`` directly, and the ``TypeError`` message
    raised by ``PySRRegressor`` when given an unknown keyword argument.
    """
    # Easy
    self.assertEqual(
        _suggest_keywords(PySRRegressor, "loss_function"), ["loss_function"]
    )

    # More complex, and with error
    with self.assertRaises(TypeError) as cm:
        # The constructor is expected to raise; its result is not needed.
        PySRRegressor(ncyclesperiterationn=5)

    self.assertIn(
        "`ncyclesperiterationn` is not a valid keyword", str(cm.exception)
    )
    self.assertIn("Did you mean", str(cm.exception))
    self.assertIn("`ncycles_per_iteration`, ", str(cm.exception))
    self.assertIn("`niterations`", str(cm.exception))

    # Farther matches (this might need to be changed)
    with self.assertRaises(TypeError) as cm:
        PySRRegressor(operators=["+", "-"])

    self.assertIn("`unary_operators`, `binary_operators`", str(cm.exception))
|
|
|
909 |
|
910 |
|
911 |
TRUE_PREAMBLE = "\n".join(
|
|
|
1039 |
middle_part_2 = r"""
|
1040 |
$y_{1} = x_{1}$ & $1$ & $1.32$ & $0.0$ \\
|
1041 |
$y_{1} = \cos{\left(x_{1} \right)}$ & $2$ & $0.0520$ & $3.23$ \\
|
1042 |
+
$y_{1} = x_{0} x_{0} x_{1}$ & $5$ & $2.00 \cdot 10^{-15}$ & $10.3$ \\
|
1043 |
"""
|
1044 |
true_latex_table_str = "\n\n".join(
|
1045 |
self.create_true_latex(part, include_score=True)
|
|
|
1092 |
middle_part = r"""
|
1093 |
$y = x_{0}$ & $1$ & $1.05$ & $0.0$ \\
|
1094 |
$y = \cos{\left(x_{0} \right)}$ & $2$ & $0.0232$ & $3.82$ \\
|
1095 |
+
\begin{minipage}{0.8\linewidth} \vspace{-1em} \begin{dmath*} y = x_{0} x_{0} x_{0} + x_{0} x_{0} x_{0} x_{0} x_{0} + 3.20 x_{0} - 1.20 x_{1} + x_{1} x_{1} x_{1} + 5.20 \sin{\left(- 2.60 x_{0} + 0.326 \sin{\left(x_{2} \right)} \right)} - \cos{\left(x_{0} x_{1} \right)} + \cos{\left(x_{0} x_{0} x_{0} + 3.20 x_{0} - 1.20 x_{1} + x_{1} x_{1} x_{1} + \cos{\left(x_{0} x_{1} \right)} \right)} \end{dmath*} \end{minipage} & $30$ & $1.12 \cdot 10^{-15}$ & $1.09$ \\
|
1096 |
"""
|
1097 |
true_latex_table_str = (
|
1098 |
TRUE_PREAMBLE
|
|
|
1146 |
"""This just checks the number of units passed"""
|
1147 |
use_custom_variable_names = False
|
1148 |
variable_names = None
|
1149 |
+
complexity_of_variables = 1
|
1150 |
weights = None
|
1151 |
+
args = (
|
1152 |
+
use_custom_variable_names,
|
1153 |
+
variable_names,
|
1154 |
+
complexity_of_variables,
|
1155 |
+
weights,
|
1156 |
+
)
|
1157 |
valid_units = [
|
1158 |
(np.ones((10, 2)), np.ones(10), ["m/s", "s"], "m"),
|
1159 |
(np.ones((10, 1)), np.ones(10), ["m/s"], None),
|
|
|
1261 |
TestBest,
|
1262 |
TestFeatureSelection,
|
1263 |
TestMiscellaneous,
|
1264 |
+
TestHelpMessages,
|
1265 |
TestLaTeXTable,
|
1266 |
TestDimensionalConstraints,
|
1267 |
]
|
pysr/test/test_jax.py
CHANGED
@@ -5,27 +5,29 @@ import numpy as np
|
|
5 |
import pandas as pd
|
6 |
import sympy
|
7 |
|
8 |
-
|
|
|
9 |
|
10 |
|
11 |
class TestJAX(unittest.TestCase):
|
12 |
def setUp(self):
|
13 |
np.random.seed(0)
|
|
|
|
|
|
|
14 |
|
15 |
def test_sympy2jax(self):
|
16 |
-
from jax import numpy as jnp
|
17 |
from jax import random
|
18 |
|
19 |
x, y, z = sympy.symbols("x y z")
|
20 |
cosx = 1.0 * sympy.cos(x) + y
|
21 |
key = random.PRNGKey(0)
|
22 |
X = random.normal(key, (1000, 2))
|
23 |
-
true = 1.0 * jnp.cos(X[:, 0]) + X[:, 1]
|
24 |
f, params = sympy2jax(cosx, [x, y, z])
|
25 |
-
self.assertTrue(jnp.all(jnp.isclose(f(X, params), true)).item())
|
26 |
|
27 |
def test_pipeline_pandas(self):
|
28 |
-
from jax import numpy as jnp
|
29 |
|
30 |
X = pd.DataFrame(np.random.randn(100, 10))
|
31 |
y = np.ones(X.shape[0])
|
@@ -52,14 +54,12 @@ class TestJAX(unittest.TestCase):
|
|
52 |
jformat = model.jax()
|
53 |
|
54 |
np.testing.assert_almost_equal(
|
55 |
-
np.array(jformat["callable"](jnp.array(X), jformat["parameters"])),
|
56 |
np.square(np.cos(X.values[:, 1])), # Select feature 1
|
57 |
decimal=3,
|
58 |
)
|
59 |
|
60 |
def test_pipeline(self):
|
61 |
-
from jax import numpy as jnp
|
62 |
-
|
63 |
X = np.random.randn(100, 10)
|
64 |
y = np.ones(X.shape[0])
|
65 |
model = PySRRegressor(progress=False, max_evals=10000, output_jax_format=True)
|
@@ -81,15 +81,46 @@ class TestJAX(unittest.TestCase):
|
|
81 |
jformat = model.jax()
|
82 |
|
83 |
np.testing.assert_almost_equal(
|
84 |
-
np.array(jformat["callable"](jnp.array(X), jformat["parameters"])),
|
85 |
np.square(np.cos(X[:, 1])), # Select feature 1
|
86 |
decimal=3,
|
87 |
)
|
88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
def test_feature_selection_custom_operators(self):
|
90 |
rstate = np.random.RandomState(0)
|
91 |
X = pd.DataFrame({f"k{i}": rstate.randn(2000) for i in range(10, 21)})
|
92 |
-
|
|
|
|
|
|
|
93 |
y = X["k15"] ** 2 + 2 * cos_approx(X["k20"])
|
94 |
|
95 |
model = PySRRegressor(
|
|
|
5 |
import pandas as pd
|
6 |
import sympy
|
7 |
|
8 |
+
import pysr
|
9 |
+
from pysr import PySRRegressor, sympy2jax
|
10 |
|
11 |
|
12 |
class TestJAX(unittest.TestCase):
|
13 |
def setUp(self):
    """Seed NumPy's global RNG with 0 and expose ``jax.numpy`` as ``self.jnp``."""
    np.random.seed(0)
    # Imported here rather than at module level — presumably so that merely
    # importing this test module does not require JAX; confirm.
    import jax.numpy

    self.jnp = jax.numpy
|
18 |
|
19 |
def test_sympy2jax(self):
    """Compare ``sympy2jax`` output for ``cos(x) + y`` against ``jax.numpy``."""
    from jax import random as jax_random

    jnp = self.jnp
    sym_x, sym_y, sym_z = sympy.symbols("x y z")
    expression = 1.0 * sympy.cos(sym_x) + sym_y

    # 1000 samples with two columns; column 0 feeds x, column 1 feeds y.
    inputs = jax_random.normal(jax_random.PRNGKey(0), (1000, 2))
    expected = 1.0 * jnp.cos(inputs[:, 0]) + inputs[:, 1]

    # z is declared as a symbol but does not appear in the expression.
    jax_fn, jax_params = sympy2jax(expression, [sym_x, sym_y, sym_z])
    elementwise_close = jnp.isclose(jax_fn(inputs, jax_params), expected)
    self.assertTrue(jnp.all(elementwise_close).item())
|
29 |
|
30 |
def test_pipeline_pandas(self):
|
|
|
31 |
|
32 |
X = pd.DataFrame(np.random.randn(100, 10))
|
33 |
y = np.ones(X.shape[0])
|
|
|
54 |
jformat = model.jax()
|
55 |
|
56 |
np.testing.assert_almost_equal(
|
57 |
+
np.array(jformat["callable"](self.jnp.array(X), jformat["parameters"])),
|
58 |
np.square(np.cos(X.values[:, 1])), # Select feature 1
|
59 |
decimal=3,
|
60 |
)
|
61 |
|
62 |
def test_pipeline(self):
|
|
|
|
|
63 |
X = np.random.randn(100, 10)
|
64 |
y = np.ones(X.shape[0])
|
65 |
model = PySRRegressor(progress=False, max_evals=10000, output_jax_format=True)
|
|
|
81 |
jformat = model.jax()
|
82 |
|
83 |
np.testing.assert_almost_equal(
|
84 |
+
np.array(jformat["callable"](self.jnp.array(X), jformat["parameters"])),
|
85 |
np.square(np.cos(X[:, 1])), # Select feature 1
|
86 |
decimal=3,
|
87 |
)
|
88 |
|
89 |
+
def test_avoid_simplification(self):
    """Export a constant-heavy expression to JAX and compare with NumPy.

    The expression ``square(exp(sign(0.44796443))) + 1.5 * x1`` is parsed
    with ``pysr2sympy``, converted with ``sympy2jax``, and evaluated on ten
    seeded samples; the result must match NumPy to three decimals.
    """
    expression = pysr.export_sympy.pysr2sympy(
        "square(exp(sign(0.44796443))) + 1.5 * x1",
        feature_names_in=["x1"],
        extra_sympy_mappings={"square": lambda x: x**2},
    )
    jax_fn, jax_params = pysr.export_jax.sympy2jax(
        expression, [sympy.symbols("x1")]
    )

    rng = np.random.RandomState(0)
    X = rng.randn(10, 1)

    actual = np.array(jax_fn(self.jnp.array(X), jax_params))
    expected = np.square(np.exp(np.sign(0.44796443))) + 1.5 * X[:, 0]
    np.testing.assert_almost_equal(actual, expected, decimal=3)
|
103 |
+
|
104 |
+
def test_issue_656(self):
    """Check that ``sympy.exp(1) + x1`` exports to JAX with the right value.

    The converted function is evaluated on ten seeded samples and must
    match ``np.exp(1) + X[:, 0]`` to three decimals.
    """
    # ``sympy`` is already imported at module level; the former
    # function-local re-import was redundant.
    E_plus_x1 = sympy.exp(1) + sympy.symbols("x1")
    f, params = pysr.export_jax.sympy2jax(E_plus_x1, [sympy.symbols("x1")])
    rng = np.random.RandomState(0)
    X = rng.randn(10, 1)
    np.testing.assert_almost_equal(
        np.array(f(self.jnp.array(X), params)),
        np.exp(1) + X[:, 0],
        decimal=3,
    )
|
116 |
+
|
117 |
def test_feature_selection_custom_operators(self):
|
118 |
rstate = np.random.RandomState(0)
|
119 |
X = pd.DataFrame({f"k{i}": rstate.randn(2000) for i in range(10, 21)})
|
120 |
+
|
121 |
+
def cos_approx(x):
|
122 |
+
return 1 - (x**2) / 2 + (x**4) / 24 + (x**6) / 720
|
123 |
+
|
124 |
y = X["k15"] ** 2 + 2 * cos_approx(X["k20"])
|
125 |
|
126 |
model = PySRRegressor(
|
pysr/test/test_startup.py
CHANGED
@@ -9,8 +9,9 @@ from pathlib import Path
|
|
9 |
|
10 |
import numpy as np
|
11 |
|
12 |
-
from
|
13 |
-
from
|
|
|
14 |
from .params import DEFAULT_NITERATIONS, DEFAULT_POPULATIONS
|
15 |
|
16 |
|
@@ -118,10 +119,6 @@ class TestStartup(unittest.TestCase):
|
|
118 |
code="import juliacall; import pysr",
|
119 |
msg="juliacall module already imported.",
|
120 |
),
|
121 |
-
dict(
|
122 |
-
code='import os; os.environ["PYSR_AUTOLOAD_EXTENSIONS"] = "foo"; import pysr',
|
123 |
-
msg="PYSR_AUTOLOAD_EXTENSIONS environment variable is set",
|
124 |
-
),
|
125 |
]
|
126 |
for warning_test in warning_tests:
|
127 |
result = subprocess.run(
|
|
|
9 |
|
10 |
import numpy as np
|
11 |
|
12 |
+
from pysr import PySRRegressor
|
13 |
+
from pysr.julia_import import jl_version
|
14 |
+
|
15 |
from .params import DEFAULT_NITERATIONS, DEFAULT_POPULATIONS
|
16 |
|
17 |
|
|
|
119 |
code="import juliacall; import pysr",
|
120 |
msg="juliacall module already imported.",
|
121 |
),
|
|
|
|
|
|
|
|
|
122 |
]
|
123 |
for warning_test in warning_tests:
|
124 |
result = subprocess.run(
|
pysr/test/test_torch.py
CHANGED
@@ -4,7 +4,8 @@ import numpy as np
|
|
4 |
import pandas as pd
|
5 |
import sympy
|
6 |
|
7 |
-
|
|
|
8 |
|
9 |
|
10 |
class TestTorch(unittest.TestCase):
|
@@ -153,10 +154,43 @@ class TestTorch(unittest.TestCase):
|
|
153 |
decimal=3,
|
154 |
)
|
155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
def test_feature_selection_custom_operators(self):
|
157 |
rstate = np.random.RandomState(0)
|
158 |
X = pd.DataFrame({f"k{i}": rstate.randn(2000) for i in range(10, 21)})
|
159 |
-
|
|
|
|
|
|
|
160 |
y = X["k15"] ** 2 + 2 * cos_approx(X["k20"])
|
161 |
|
162 |
model = PySRRegressor(
|
|
|
4 |
import pandas as pd
|
5 |
import sympy
|
6 |
|
7 |
+
import pysr
|
8 |
+
from pysr import PySRRegressor, sympy2torch
|
9 |
|
10 |
|
11 |
class TestTorch(unittest.TestCase):
|
|
|
154 |
decimal=3,
|
155 |
)
|
156 |
|
157 |
+
def test_avoid_simplification(self):
    """Export a constant-heavy expression to PyTorch and compare with NumPy.

    SymPy should not simplify without permission.
    """
    # Normally the constant sub-expression below would become exp1 and
    # require its own mapping.
    expression = pysr.export_sympy.pysr2sympy(
        "square(exp(sign(0.44796443))) + 1.5 * x1",
        feature_names_in=["x1"],
        extra_sympy_mappings={"square": lambda x: x**2},
    )
    module = pysr.export_torch.sympy2torch(expression, ["x1"])

    X = np.random.RandomState(0).randn(10, 1)

    actual = module(self.torch.tensor(X)).detach().numpy()
    expected = np.square(np.exp(np.sign(0.44796443))) + 1.5 * X[:, 0]
    np.testing.assert_almost_equal(actual, expected, decimal=3)
|
175 |
+
|
176 |
+
def test_issue_656(self):
    """Check that ``sympy.exp(1) + x1`` exports to PyTorch with the right value.

    Should correctly map numeric symbols to floats: the converted module is
    evaluated on ten samples and must match ``np.exp(1) + X[:, 0]`` to three
    decimals.
    """
    E_plus_x1 = sympy.exp(1) + sympy.symbols("x1")
    m = pysr.export_torch.sympy2torch(E_plus_x1, ["x1"])
    # Use a locally seeded generator, as the neighbouring
    # test_avoid_simplification does, so the input does not depend on
    # global RNG state.
    rng = np.random.RandomState(0)
    X = rng.randn(10, 1)
    np.testing.assert_almost_equal(
        m(self.torch.tensor(X)).detach().numpy(),
        np.exp(1) + X[:, 0],
        decimal=3,
    )
|
186 |
+
|
187 |
def test_feature_selection_custom_operators(self):
|
188 |
rstate = np.random.RandomState(0)
|
189 |
X = pd.DataFrame({f"k{i}": rstate.randn(2000) for i in range(10, 21)})
|
190 |
+
|
191 |
+
def cos_approx(x):
|
192 |
+
return 1 - (x**2) / 2 + (x**4) / 24 + (x**6) / 720
|
193 |
+
|
194 |
y = X["k15"] ** 2 + 2 * cos_approx(X["k20"])
|
195 |
|
196 |
model = PySRRegressor(
|
pysr/utils.py
CHANGED
@@ -1,10 +1,20 @@
|
|
|
|
|
|
1 |
import os
|
2 |
import re
|
|
|
|
|
3 |
|
4 |
-
from
|
|
|
5 |
|
|
|
6 |
|
7 |
-
|
|
|
|
|
|
|
|
|
8 |
if os.path.splitext(csv_filename)[1] == ".pkl":
|
9 |
return csv_filename
|
10 |
|
@@ -53,3 +63,13 @@ def _subscriptify(i: int) -> str:
|
|
53 |
For example, 123 -> "₁₂₃".
|
54 |
"""
|
55 |
return "".join([chr(0x2080 + int(c)) for c in str(i)])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import difflib
|
2 |
+
import inspect
|
3 |
import os
|
4 |
import re
|
5 |
+
from pathlib import Path
|
6 |
+
from typing import Any, List, TypeVar, Union
|
7 |
|
8 |
+
from numpy import ndarray
|
9 |
+
from sklearn.utils.validation import _check_feature_names_in # type: ignore
|
10 |
|
11 |
+
T = TypeVar("T", bound=Any)
|
12 |
|
13 |
+
ArrayLike = Union[ndarray, List[T]]
|
14 |
+
PathLike = Union[str, Path]
|
15 |
+
|
16 |
+
|
17 |
+
def _csv_filename_to_pkl_filename(csv_filename: PathLike) -> PathLike:
|
18 |
if os.path.splitext(csv_filename)[1] == ".pkl":
|
19 |
return csv_filename
|
20 |
|
|
|
63 |
For example, 123 -> "₁₂₃".
|
64 |
"""
|
65 |
return "".join([chr(0x2080 + int(c)) for c in str(i)])
|
66 |
+
|
67 |
+
|
68 |
+
def _suggest_keywords(cls, k: str) -> List[str]:
|
69 |
+
valid_keywords = [
|
70 |
+
param
|
71 |
+
for param in inspect.signature(cls.__init__).parameters
|
72 |
+
if param not in ["self", "kwargs"]
|
73 |
+
]
|
74 |
+
suggestions = difflib.get_close_matches(k, valid_keywords, n=3)
|
75 |
+
return suggestions
|
requirements.txt
CHANGED
@@ -1,8 +1,7 @@
|
|
1 |
sympy>=1.0.0,<2.0.0
|
2 |
pandas>=0.21.0,<3.0.0
|
3 |
-
numpy>=1.13.0,<
|
4 |
scikit_learn>=1.0.0,<2.0.0
|
5 |
-
juliacall==0.9.
|
6 |
click>=7.0.0,<9.0.0
|
7 |
setuptools>=50.0.0
|
8 |
-
typing_extensions>=4.0.0,<5.0.0; python_version < "3.8"
|
|
|
1 |
sympy>=1.0.0,<2.0.0
|
2 |
pandas>=0.21.0,<3.0.0
|
3 |
+
numpy>=1.13.0,<3.0.0
|
4 |
scikit_learn>=1.0.0,<2.0.0
|
5 |
+
juliacall==0.9.20
|
6 |
click>=7.0.0,<9.0.0
|
7 |
setuptools>=50.0.0
|
|