diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index dcb80cb556f5ccdd8626ba6cbbaccb89cd80c392..e35f59d2051aee6a1e6f9b528ab47fcd01e13c8d 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -19,4 +19,3 @@ body: attributes: value: | Be sure to check out the [PySR forums](https://github.com/MilesCranmer/PySR/discussions) to chat with other users about PySR use-cases! - diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index cd56769bbfb755ce0fdf9c0a893bde5cc22e68bb..b863a4d8f582fc3913802d4003be14ec2b6781ec 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -32,7 +32,7 @@ jobs: julia-version: ['1.9'] python-version: ['3.10'] os: [ubuntu-latest] - + steps: - uses: actions/checkout@v3 - name: "Set up Julia" @@ -96,7 +96,7 @@ jobs: matrix: python-version: ['3.9'] os: ['ubuntu-latest'] - + steps: - uses: actions/checkout@v3 - name: "Cache conda" @@ -129,7 +129,7 @@ jobs: coveralls: name: Indicate completion to coveralls.io - needs: + needs: - test runs-on: ubuntu-latest defaults: diff --git a/.github/workflows/CI_Windows.yml b/.github/workflows/CI_Windows.yml index 73fb1f45d9a02316950fd9be55921d88c45c41e5..5ca0cce21482a050da6259f64d8cd4675bfd1eff 100644 --- a/.github/workflows/CI_Windows.yml +++ b/.github/workflows/CI_Windows.yml @@ -32,7 +32,7 @@ jobs: julia-version: ['1.9'] python-version: ['3.10'] os: [windows-latest] - + steps: - uses: actions/checkout@v3 - name: "Set up Julia" diff --git a/.github/workflows/CI_conda_forge.yml b/.github/workflows/CI_conda_forge.yml index 9fbb4be70b973d30b42a61aa935501bc36043e2a..51c70f9a6b63ab5af3573ea4cd3002ef183233a5 100644 --- a/.github/workflows/CI_conda_forge.yml +++ b/.github/workflows/CI_conda_forge.yml @@ -23,7 +23,7 @@ jobs: python-version: ['3.8', '3.9', '3.10', '3.11'] os: ['ubuntu-latest', 'macos-latest'] use-mamba: [true, false] - + steps: - name: "Set up Conda" uses: 
conda-incubator/setup-miniconda@v2 diff --git a/.github/workflows/CI_docker_large_nightly.yml b/.github/workflows/CI_docker_large_nightly.yml index c9a7a8063d764adb99acd4a45491739cc839cecc..7c8ac50808cc808e4d41e89dd067b8ebc6615416 100644 --- a/.github/workflows/CI_docker_large_nightly.yml +++ b/.github/workflows/CI_docker_large_nightly.yml @@ -22,8 +22,8 @@ jobs: python-version: ['3.10'] os: [ubuntu-latest] arch: ['linux/amd64', 'linux/arm64'] - - + + steps: - uses: actions/checkout@v3 - name: Set up QEMU diff --git a/.github/workflows/CI_large_nightly.yml b/.github/workflows/CI_large_nightly.yml index eddfec137b25d6b47dd5e66b76f7961f23dd3bd5..950072b5e53df9d62c586b7208cb8ce097daaf1e 100644 --- a/.github/workflows/CI_large_nightly.yml +++ b/.github/workflows/CI_large_nightly.yml @@ -26,7 +26,7 @@ jobs: julia-version: ['1.6', '1.8', '1.9'] python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] os: [ubuntu-latest, macos-latest, windows-latest] - + steps: - uses: actions/checkout@v3 - name: "Set up Julia" diff --git a/.github/workflows/CI_mac.yml b/.github/workflows/CI_mac.yml index 783f1b3c84c6c114f9d79915521002be4e5f491b..6edb13ca27e8e844151ec0d276b50637bc9b943d 100644 --- a/.github/workflows/CI_mac.yml +++ b/.github/workflows/CI_mac.yml @@ -32,7 +32,7 @@ jobs: julia-version: ['1.9'] python-version: ['3.10'] os: [macos-latest] - + steps: - uses: actions/checkout@v3 - name: "Set up Julia" diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index b0be5146710e79af9f59428ff6e3e8dd5c9ed26c..5605428d8da7fd30f49583ae675cd1a0b65639d2 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -37,11 +37,11 @@ jobs: # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. 
- + # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs # queries: security-extended,security-and-quality - + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild @@ -50,7 +50,7 @@ jobs: # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun - # If the Autobuild fails above, remove it and uncomment the following three lines. + # If the Autobuild fails above, remove it and uncomment the following three lines. # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. # - run: | diff --git a/.github/workflows/docker_deploy.yml b/.github/workflows/docker_deploy.yml index 14d6bd85d1af418b8a7b42dd47b67c54f93cc710..c7bc0e6a60e6ebb75ee6665cb831b0b88257b5ec 100644 --- a/.github/workflows/docker_deploy.yml +++ b/.github/workflows/docker_deploy.yml @@ -9,7 +9,7 @@ on: tags: - "v*.*.*" workflow_dispatch: - + jobs: docker: diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 42c5975a5026c14a2fa6ad8fc7a74c09dc07065d..2abe1477964bd9ee84f70129d747395d4dc26111 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -18,7 +18,7 @@ jobs: defaults: run: shell: bash - + steps: - uses: actions/checkout@v3 - name: "Set up Python" @@ -33,4 +33,4 @@ jobs: - name: "Build API docs" run: cd docs && ./gen_docs.sh - name: "Deploy documentation" - run: mkdocs gh-deploy --force \ No newline at end of file + run: mkdocs gh-deploy --force diff --git a/.github/workflows/update_backend.yml b/.github/workflows/update_backend.yml index 
a1ab23bef5fe4f56ddb8536336ee49bfc45a915e..3b47b700ae28418fa5fd3e7af14a58974ad63d8b 100644 --- a/.github/workflows/update_backend.yml +++ b/.github/workflows/update_backend.yml @@ -48,7 +48,7 @@ jobs: CURRENT_PYSR_PATCH_VERSION=$(python -c 'import pysr; print(pysr.version.__version__.split(".")[-1], end="")' 2>/dev/null) NEW_PYSR_PATCH_VERSION=$((CURRENT_PYSR_PATCH_VERSION + 1)) sed -i "s/^__version__ = .*/__version__ = \"$(python -c 'import pysr; print(".".join(pysr.version.__version__.split(".")[:-1]), end="")' 2>/dev/null).${NEW_PYSR_PATCH_VERSION}\"/" pysr/version.py - + # Set SymbolicRegression.jl version: sed -i "s/^__symbolic_regression_jl_version__ = .*/__symbolic_regression_jl_version__ = \"${{ steps.get-latest.outputs.version }}\"/" pysr/version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d1c0ed857745c86b307010bc945ba4cbfc731a0c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,31 @@ +repos: + # General linting + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.2.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + # General formatting + - repo: https://github.com/psf/black + rev: 23.3.0 + hooks: + - id: black + - id: black-jupyter + # Stripping notebooks + - repo: https://github.com/kynan/nbstripout + rev: 0.6.1 + hooks: + - id: nbstripout + # Unused imports + - repo: https://github.com/hadialqattan/pycln + rev: "v2.2.2" + hooks: + - id: pycln + # Sorted imports + - repo: https://github.com/PyCQA/isort + rev: "5.12.0" + hooks: + - id: isort + additional_dependencies: [toml] diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 5c7bcfc0b656e7255b1b2404ffbcb2e606123654..f1695ea758c184e7146c95eb10ef7e2657187d74 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -121,4 +121,4 @@ Thanks for being part of the PySR community! 
- \ No newline at end of file + diff --git a/README.md b/README.md index aa65563af6a1e2c98e2a582ccdd51281af071cc2..e7db9d506b6441d99665bc558cfd4a23a335c4da 100644 --- a/README.md +++ b/README.md @@ -155,7 +155,7 @@ The PySR build in conda includes all required dependencies, so you can install i conda install -c conda-forge pysr ``` -from within your target conda environment. +from within your target conda environment. However, note that the conda install does not support precompilation of Julia libraries, so the start time may be slightly slower as the JIT-compilation will be running. @@ -305,7 +305,7 @@ model = PySRRegressor( # ^ 2 populations per core, so one is always running. population_size=50, # ^ Slightly larger populations, for greater diversity. - ncyclesperiteration=500, + ncyclesperiteration=500, # ^ Generations between migrations. niterations=10000000, # Run forever early_stop_condition=( diff --git a/benchmarks/README.md b/benchmarks/README.md index 3cedfd4f3094873e57e8abd19ef8ee09a7b8715c..b8a5221c064aba5a16f75ace1ae86422b5ea5ea3 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -21,7 +21,7 @@ v0.3.6 | 25900 v0.3.7 | 26600 v0.3.8 | 7470 v0.3.9 | 6760 -v0.3.10 | +v0.3.10 | v0.3.11 | 19500 v0.3.12 | 19000 v0.3.13 | 15200 diff --git a/benchmarks/hyperparamopt.py b/benchmarks/hyperparamopt.py index 3c933f89e647a75c195052bb7f2ca9f10923666b..e6f4b1cf03c41aa2325d7b67ba1b3316741ac481 100644 --- a/benchmarks/hyperparamopt.py +++ b/benchmarks/hyperparamopt.py @@ -1,13 +1,15 @@ """Start a hyperoptimization from a single node""" -import sys -import numpy as np import pickle as pkl -from pysr import PySRRegressor +import sys + import hyperopt -from hyperopt import hp, fmin, tpe, Trials +import numpy as np +from hyperopt import Trials, fmin, hp, tpe from hyperopt.fmin import generate_trials_to_calculate from space import * +from pysr import PySRRegressor + # Change the following code to your file 
################################################################################ TRIALS_FOLDER = "trials2" diff --git a/benchmarks/print_best_model.py b/benchmarks/print_best_model.py index 880a4da34a0c0c16a226a7b52b8fb74091e7a7ed..3dd9a6b1a39bc14ab45c66249cd60b4f74d1b011 100644 --- a/benchmarks/print_best_model.py +++ b/benchmarks/print_best_model.py @@ -1,12 +1,11 @@ """Print the best model parameters and loss""" -import sys -import numpy as np import pickle as pkl -import hyperopt -from hyperopt import hp, fmin, tpe, Trials -from space import space from pprint import PrettyPrinter +import hyperopt +import numpy as np +from hyperopt import Trials, fmin, hp, tpe +from space import space # Change the following code to your file ################################################################################ @@ -51,7 +50,6 @@ import glob path = TRIALS_FOLDER + "/*.pkl" files = 0 for fname in glob.glob(path): - trials_obj = pkl.load(open(fname, "rb")) n_trials = trials_obj["n"] trials_obj = trials_obj["trials"] diff --git a/benchmarks/space.py b/benchmarks/space.py index 284addd7237441b437160815874cd0ed63068e2a..5d6a2e4eba792c3ee03fae1e6d0dcf67a1edf022 100644 --- a/benchmarks/space.py +++ b/benchmarks/space.py @@ -1,5 +1,5 @@ import numpy as np -from hyperopt import hp, fmin, tpe, Trials +from hyperopt import Trials, fmin, hp, tpe binary_operators = ["*", "/", "+", "-"] unary_operators = ["sin", "cos", "exp", "log"] diff --git a/datasets/FeynmanEquations.csv b/datasets/FeynmanEquations.csv index bd321d81ab4b586a74fc20942fd9e909a43fed09..bd80cfba0b4bf59d6b99bb98fbc6445101f6ce48 100644 --- a/datasets/FeynmanEquations.csv +++ b/datasets/FeynmanEquations.csv @@ -98,4 +98,4 @@ III.15.14,10,96,m,(h/(2*pi))**2/(2*E_n*d**2),3,h,1,5,E_n,1,5,d,1,5,,,,,,,,,,,,,, III.15.27,10,97,k,2*pi*alpha/(n*d),3,alpha,1,5,n,1,5,d,1,5,,,,,,,,,,,,,,,,,,,,, III.17.37,10,98,f,beta*(1+alpha*cos(theta)),3,beta,1,5,alpha,1,5,theta,1,5,,,,,,,,,,,,,,,,,,,,, 
III.19.51,10,99,E_n,-m*q**4/(2*(4*pi*epsilon)**2*(h/(2*pi))**2)*(1/n**2),5,m,1,5,q,1,5,h,1,5,n,1,5,epsilon,1,5,,,,,,,,,,,,,,, -III.21.20,10,100,j,-rho_c_0*q*A_vec/m,4,rho_c_0,1,5,q,1,5,A_vec,1,5,m,1,5,,,,,,,,,,,,,,,,,, \ No newline at end of file +III.21.20,10,100,j,-rho_c_0*q*A_vec/m,4,rho_c_0,1,5,q,1,5,A_vec,1,5,m,1,5,,,,,,,,,,,,,,,,,, diff --git a/docs/.gitignore b/docs/.gitignore index e59a8b16fb719525535a0566758a29f631dac10a..43130b842729f45d07d8a18a48eb0bc472ecbe3c 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,4 +1,4 @@ build api.md index.md.bak -papers.md \ No newline at end of file +papers.md diff --git a/docs/_api.md b/docs/_api.md index 2cd838e14ce86fcffa0e0a7d8366f82087ede9d9..bbc6c1f2f2a1f5c3e033adf765bec3fbe77fa058 100644 --- a/docs/_api.md +++ b/docs/_api.md @@ -6,7 +6,7 @@ Let's look at them below. PARAMSKEY ## PySRRegressor Functions - + ::: pysr.PySRRegressor.fit options: show_root_heading: true @@ -60,5 +60,3 @@ PARAMSKEY show_root_heading: true heading_level: 3 show_root_full_path: false - - diff --git a/docs/assets/pysr_logo.svg b/docs/assets/pysr_logo.svg index 0b8e8fa7cb6570ce7b1c555bab6f3c6716d190b4..7de145bd8aaf376ef624abdc6d66e2e215d8e05c 100644 --- a/docs/assets/pysr_logo.svg +++ b/docs/assets/pysr_logo.svg @@ -1 +1 @@ -PySR \ No newline at end of file +PySR diff --git a/docs/assets/pysr_logo_reduced.svg b/docs/assets/pysr_logo_reduced.svg index da33798c7079f69a2f4ca55c848314b59e7cd2c4..162d20c2a1642634483c64523dcdae4c5f6d68cd 100644 --- a/docs/assets/pysr_logo_reduced.svg +++ b/docs/assets/pysr_logo_reduced.svg @@ -12,7 +12,7 @@ .st6{letter-spacing:1;} - + diff --git a/docs/backend.md b/docs/backend.md index 7bf1d7dc5fb9d122b0b61c8e57c06126ef5d732a..0c7afdbf80285e4f57ad6f2dc2d0fc36a8ea2a70 100644 --- a/docs/backend.md +++ b/docs/backend.md @@ -12,7 +12,7 @@ Generally you can do this as follows: git clone https://github.com/MilesCranmer/SymbolicRegression.jl ``` 2. 
Edit the source code in `src/` to your requirements: - - The documentation for the backend is given [here](https://astroautomata.com/SymbolicRegression.jl/dev/). + - The documentation for the backend is given [here](https://astroautomata.com/SymbolicRegression.jl/dev/). - Throughout the package, you will often see template functions which typically use a symbol `T` (such as in the string `where {T<:Real}`). Here, `T` is simply the datatype of the input data and stored constants, such as `Float32` or `Float64`. Writing functions in this way lets us write functions generic to types, while still having access to the specific type specified at compilation time. - Expressions are stored as binary trees, using the `Node{T}` type, described [here](https://astroautomata.com/SymbolicRegression.jl/dev/types/#SymbolicRegression.CoreModule.EquationModule.Node). - Parts of the code which are typically edited by users include: @@ -26,4 +26,4 @@ git clone https://github.com/MilesCranmer/SymbolicRegression.jl If you get comfortable enough with the backend, you might consider using the Julia package directly: the API is given on the [SymbolicRegression.jl documentation](https://astroautomata.com/SymbolicRegression.jl/dev/). -If you make a change that you think could be useful to other users, don't hesitate to open a pull request on either the PySR or SymbolicRegression.jl repositories! Contributions are very appreciated. \ No newline at end of file +If you make a change that you think could be useful to other users, don't hesitate to open a pull request on either the PySR or SymbolicRegression.jl repositories! Contributions are very appreciated. 
diff --git a/docs/gen_param_docs.py b/docs/gen_param_docs.py index 6016c584b09ec18dc7d96be981efdfa6a5de4aed..8b12ab9bff7b56c827aadb5c7069906651a921f6 100644 --- a/docs/gen_param_docs.py +++ b/docs/gen_param_docs.py @@ -1,13 +1,14 @@ # Load YAML file param_groupings.yml: -from pathlib import Path -from yaml import safe_load +import re import sys +from docstring_parser import parse +from yaml import safe_load + sys.path.append("..") + + from pysr import PySRRegressor -import pysr -import re -from docstring_parser import parse found_params = [] diff --git a/docs/generate_papers.py b/docs/generate_papers.py index 2ac1d90b3b80f8f2f87ff0be85995fa4410250ac..1ac82fc178d319a108d873000fdb46e20ea9cee5 100644 --- a/docs/generate_papers.py +++ b/docs/generate_papers.py @@ -1,7 +1,8 @@ """This script generates the papers.md file from the papers.yml file.""" -import yaml from pathlib import Path +import yaml + data_file = "papers.yml" papers_header = Path("stylesheets") / "papers_header.txt" output_file = "papers.md" @@ -49,7 +50,7 @@ with open(output_file, "w") as f:
{authors} - + {affiliations}
diff --git a/docs/operators.md b/docs/operators.md index 2b0784b39281f635bee64f7eadd98e032bd3c06f..1d37f36728acd02498a0e67e7507c5d2c7dc406e 100644 --- a/docs/operators.md +++ b/docs/operators.md @@ -64,5 +64,3 @@ instead of `1.5e3`, if you write any constant numbers. Your operator should work with the entire real line (you can use abs(x) for operators requiring positive input - see `log_abs`); otherwise the search code will experience domain errors. - - diff --git a/docs/options.md b/docs/options.md index 2320d3433e1c690ab25bd0a2793136bbc6e80678..5eee94d8e7c55566c7a7edab2f44c3ea2086c2d8 100644 --- a/docs/options.md +++ b/docs/options.md @@ -265,7 +265,7 @@ PySRRegressor(..., loss="loss(x, y) = abs(x * y)") With weights: ```python -model = PySRRegressor(..., loss="myloss(x, y, w) = w * abs(x - y)") +model = PySRRegressor(..., loss="myloss(x, y, w) = w * abs(x - y)") model.fit(..., weights=weights) ``` diff --git a/docs/papers.yml b/docs/papers.yml index 0ee2780a9124827e5f0bc40c82962c40fca79731..94d3b228c0e018ee9abe1628be76809cdaa7ce1f 100644 --- a/docs/papers.yml +++ b/docs/papers.yml @@ -151,7 +151,6 @@ papers: abstract: "We present an approach for using machine learning to automatically discover the governing equations and hidden properties of real physical systems from observations. We train a \"graph neural network\" to simulate the dynamics of our solar system's Sun, planets, and large moons from 30 years of trajectory data. We then use symbolic regression to discover an analytical expression for the force law implicitly learned by the neural network, which our results showed is equivalent to Newton's law of gravitation. The key assumptions that were required were translational and rotational equivariance, and Newton's second and third laws of motion. Our approach correctly discovered the form of the symbolic force law. Furthermore, our approach did not require any assumptions about the masses of planets and moons or physical constants. 
They, too, were accurately inferred through our methods. Though, of course, the classical law of gravitation has been known since Isaac Newton, our result serves as a validation that our method can discover unknown laws and hidden properties from observed data. More broadly this work represents a key step toward realizing the potential of machine learning for accelerating scientific discovery." image: rediscovering_gravity.png date: 2022-02-04 - link: https://arxiv.org/abs/2202.02306 - title: (Thesis) On Neural Differential Equations - Section 6.1 authors: - Patrick Kidger (1) diff --git a/docs/requirements.txt b/docs/requirements.txt index 29381320e63368d79e5ef1fbf642e8dd2a80dd88..064f3d54314ff9fb413b30c3c34bc33523f34e6d 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ mkdocs-material mkdocs-autorefs mkdocstrings[python] -docstring_parser \ No newline at end of file +docstring_parser diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index 7476727237c6dcb23d1ac6a564172124fd5bd797..8db9047e24c5ac76e166cb26cc96ea9c8a7d7740 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -2,4 +2,4 @@ --md-primary-fg-color: #C13245; --md-primary-fg-color--light: #D35364; --md-primary-fg-color--dark: #982736; -} \ No newline at end of file +} diff --git a/docs/stylesheets/papers_header.txt b/docs/stylesheets/papers_header.txt index 09e0083423304fc585238bbb663861e2ef479dd1..e1e908a36a309ea383386a7350fd998a722ab084 100644 --- a/docs/stylesheets/papers_header.txt +++ b/docs/stylesheets/papers_header.txt @@ -6,4 +6,3 @@ These are sorted by the date of release, with most recent papers at the top. If you have used PySR in your research, please submit a pull request to add your paper to [this file](https://github.com/MilesCranmer/PySR/blob/master/docs/papers.yml). 
- diff --git a/docs/tuning.md b/docs/tuning.md index 34ac682f7413a3ff80865e2bcd425e71877befc9..954d7e5f8aafb8a398c3c79ba5aaa322a60e14a7 100644 --- a/docs/tuning.md +++ b/docs/tuning.md @@ -17,7 +17,7 @@ I run from IPython (Jupyter Notebooks don't work as well[^1]) on the head node o 5. Set `ncyclesperiteration` to maybe `5000` or so, until the head node occupation is under `10%`. 6. Set `constraints` and `nested_constraints` as strict as possible. These can help quite a bit with exploration. Typically, if I am using `pow`, I would set `constraints={"pow": (9, 1)}`, so that power laws can only have a variable or constant as their exponent. If I am using `sin` and `cos`, I also like to set `nested_constraints={"sin": {"sin": 0, "cos": 0}, "cos": {"sin": 0, "cos": 0}}`, so that sin and cos can't be nested, which seems to happen frequently. (Although in practice I would just use `sin`, since the search could always add a phase offset!) 7. Set `maxsize` a bit larger than the final size you want. e.g., if you want a final equation of size `30`, you might set this to `35`, so that it has a bit of room to explore. -8. Set `maxdepth` strictly, but leave a bit of room for exploration. e.g., if you want a final equation limited to a depth of `5`, you might set this to `6` or `7`, so that it has a bit of room to explore. +8. Set `maxdepth` strictly, but leave a bit of room for exploration. e.g., if you want a final equation limited to a depth of `5`, you might set this to `6` or `7`, so that it has a bit of room to explore. 9. Set `parsimony` equal to about the minimum loss you would expect, divided by 5-10. e.g., if you expect the final equation to have a loss of `0.001`, you might set `parsimony=0.0001`. 10. Set `weight_optimize` to some larger value, maybe `0.001`. This is very important if `ncyclesperiteration` is large, so that optimization happens more frequently. 11. Set `turbo` to `True`. 
This may or not work, if there's an error just turn it off (some operators are not SIMD-capable). If it does work, it should give you a nice 20% speedup. @@ -31,7 +31,7 @@ Some things I try out to see if they help: 2. Try setting `adaptive_parsimony_scaling` a bit larger, maybe up to `1000`. 3. Sometimes I try using `warmup_maxsize_by`. This is useful if you find that the search finds a very complex equation very quickly, and then gets stuck. It basically forces it to start at the simpler equations and build up complexity slowly. 4. Play around with different losses: - - I typically try `L2DistLoss()` and `L1DistLoss()`. L1 loss is more robust to outliers compared to L2 (L1 finds the median, while L2 finds the mean of a random variable), so is often a good choice for a noisy dataset. + - I typically try `L2DistLoss()` and `L1DistLoss()`. L1 loss is more robust to outliers compared to L2 (L1 finds the median, while L2 finds the mean of a random variable), so is often a good choice for a noisy dataset. - I might also provide the `weights` parameter to `fit` if there is some reasonable choice of weighting. For example, maybe I know the signal-to-noise of a particular row of `y` - I would set that SNR equal to the weights. Or, perhaps I do some sort of importance sampling, and weight the rows by importance. Very rarely I might also try tuning the mutation weights, the crossover probability, or the optimization parameters. I never use `denoise` or `select_k_features` as I find they aren't very useful. 
diff --git a/environment.yml b/environment.yml index 62b6c9208f4b3d5e8b90e4c1adfa3dd0e3d28a97..9eea93761baad4a8bd448dca2e06c7ddf1b2c653 100644 --- a/environment.yml +++ b/environment.yml @@ -10,4 +10,4 @@ dependencies: - pyjulia - openlibm - openspecfun - - click \ No newline at end of file + - click diff --git a/examples/pysr_demo.ipynb b/examples/pysr_demo.ipynb index b784bbfd1af21d4e2143c84a0dcce8b1397a051e..31d475985155eb461c12acd49e4b274d6ec25caf 100644 --- a/examples/pysr_demo.ipynb +++ b/examples/pysr_demo.ipynb @@ -1,1465 +1,1474 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "DS4E1PagbDgL" - }, - "source": [ - "# Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tQ1r1bbb0yBv" - }, - "source": [ - "\n", - "## Instructions\n", - "1. Work on a copy of this notebook: _File_ > _Save a copy in Drive_ (you will need a Google account).\n", - "2. (Optional) If you would like to do the deep learning component of this tutorial, turn on the GPU with Edit->Notebook settings->Hardware accelerator->GPU\n", - "3. Execute the following cell (click on it and press Ctrl+Enter) to install Julia, IJulia and other packages (if needed, update `JULIA_VERSION` and the other parameters). This takes a couple of minutes.\n", - "4. Continue to the next section.\n", - "\n", - "_Notes_:\n", - "* If your Colab Runtime gets reset (e.g., due to inactivity), repeat steps 3, 4.\n", - "* After installation, if you want to change the Julia version or activate/deactivate the GPU, you will need to reset the Runtime: _Runtime_ > _Delete and disconnect runtime_ and repeat steps 2-4." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "COndi88gbDgO" - }, - "source": [ - "**Run the following code to install Julia**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GIeFXS0F0zww" - }, - "outputs": [], - "source": [ - "%%shell\n", - "set -e\n", - "\n", - "#---------------------------------------------------#\n", - "JULIA_VERSION=\"1.8.5\"\n", - "export JULIA_PKG_PRECOMPILE_AUTO=0\n", - "#---------------------------------------------------#\n", - "\n", - "if [ -z `which julia` ]; then\n", - " # Install Julia\n", - " JULIA_VER=`cut -d '.' -f -2 <<< \"$JULIA_VERSION\"`\n", - " echo \"Installing Julia $JULIA_VERSION on the current Colab Runtime...\"\n", - " BASE_URL=\"https://julialang-s3.julialang.org/bin/linux/x64\"\n", - " URL=\"$BASE_URL/$JULIA_VER/julia-$JULIA_VERSION-linux-x86_64.tar.gz\"\n", - " wget -nv $URL -O /tmp/julia.tar.gz # -nv means \"not verbose\"\n", - " tar -x -f /tmp/julia.tar.gz -C /usr/local --strip-components 1\n", - " rm /tmp/julia.tar.gz\n", - "\n", - " echo \"Installing PyCall.jl...\"\n", - " julia -e 'using Pkg; Pkg.add(\"PyCall\"); Pkg.build(\"PyCall\")'\n", - " julia -e 'println(\"Success\")'\n", - "\n", - "fi" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ORv1c6xvbDgV" - }, - "source": [ - "Install PySR and PyTorch-Lightning:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EhMRSZEYFPLz" - }, - "outputs": [], - "source": [ - "%pip install -Uq pysr pytorch_lightning" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "etTMEV0wDqld" - }, - "source": [ - "The following step is not normally required, but colab's printing is non-standard and we need to manually set it up PyJulia:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "j666aOI8xWF_" - }, - "outputs": [], - "source": [ - "from julia import Julia\n", - "\n", - "julia = Julia(compiled_modules=False, 
threads='auto')\n", - "from julia import Main\n", - "from julia.tools import redirect_output_streams\n", - "\n", - "redirect_output_streams()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u2WhbVhht-G" - }, - "source": [ - "Let's install the backend of PySR, and all required libraries.\n", - "\n", - "**(This may take some time)**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "J-0QbxyK1_51" - }, - "outputs": [], - "source": [ - "import pysr\n", - "\n", - "# We don't precompile in colab because compiled modules are incompatible static Python libraries:\n", - "pysr.install(precompile=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vFpyRxmhFqeH" - }, - "outputs": [], - "source": [ - "import sympy\n", - "import numpy as np\n", - "from matplotlib import pyplot as plt\n", - "from pysr import PySRRegressor\n", - "from sklearn.model_selection import train_test_split" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gsRMQ7grbDga" - }, - "source": [ - "# Simple PySR example:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "myTEwdiUFiGL" - }, - "source": [ - "First, let's learn a simple function\n", - "\n", - "$$2.5382 \\cos(x3) + x0^2 - 2$$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Cb1eb2XuFQh8" - }, - "outputs": [], - "source": [ - "# Dataset\n", - "np.random.seed(0)\n", - "X = 2 * np.random.randn(100, 5)\n", - "y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cturCkaVjzLs" - }, - "source": [ - "By default, we will set up 30 populations of expressions (which evolve independently except for migrations), use 4 threads, and use `\"best\"` for our model selection strategy:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4nDAAnisdhTc" - }, - "outputs": [], - "source": [ - 
"default_pysr_params = dict(\n", - " populations=30,\n", - " model_selection=\"best\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "N4gANfkaj8ie" - }, - "source": [ - "PySR can run for arbitrarily long, and continue to find more and more accurate expressions. You can set the total number of cycles of evolution with `niterations`, although there are also a [few more ways](https://github.com/MilesCranmer/PySR/pull/134) to stop execution.\n", - "\n", - "**This first execution will take a bit longer to startup, as the library is JIT-compiled. The next execution will be much faster.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "p4PSrO-NK1Wa" - }, - "outputs": [], - "source": [ - "# Learn equations\n", - "model = PySRRegressor(\n", - " niterations=30,\n", - " binary_operators=[\"plus\", \"mult\"],\n", - " unary_operators=[\"cos\", \"exp\", \"sin\"],\n", - " **default_pysr_params\n", - ")\n", - "\n", - "model.fit(X, y)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-bsAECbdkQsQ" - }, - "source": [ - "We can print the model, which will print out all the discovered expressions:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4HR8gknlZz4W" - }, - "outputs": [], - "source": [ - "model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ME3ddPxXkWQg" - }, - "source": [ - "We can also view the SymPy format of the best expression:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "IQKOohdpztS7" - }, - "outputs": [], - "source": [ - "model.sympy()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EHIIPlmClltn" - }, - "source": [ - "We can also view the SymPy of any other expression in the list, using the index of it in `model.equations_`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GRcxq-TTlpRX" - }, - "outputs": [], - "source": [ - "model.sympy(2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YMugcGX4tbqj" - }, - "source": [ - "## Output" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gIWt5wz5cjXE" - }, - "source": [ - "`model.equations_` is a Pandas DataFrame. We can export the results in various ways:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HFGaNL6tbDgi" - }, - "outputs": [], - "source": [ - "model.latex()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4hS8kqutcmPQ" - }, - "source": [ - "These is also `model.sympy(), model.jax(), model.pytorch()`. All of these can take an index as input, to get the result for an arbitrary equation in the list.\n", - "\n", - "We can also use `model.predict` for arbitrary equations, with the default equation being the one chosen by `model_selection`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Vbz4IMsk2NYH" - }, - "outputs": [], - "source": [ - "ypredict = model.predict(X)\n", - "ypredict_simpler = model.predict(X, 2)\n", - "\n", - "print(\"Default selection MSE:\", np.power(ypredict - y, 2).mean())\n", - "print(\"Manual selection MSE for index 2:\", np.power(ypredict_simpler - y, 2).mean())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SQDUScGebDgr" - }, - "source": [ - "# Custom operators" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qvgVbOoSFtQY" - }, - "source": [ - "A full list of operators is given here: https://astroautomata.com/PySR/operators,\n", - "but we can also use any binary or unary operator in `julia`, or define our own as arbitrary functions.\n", - "\n", - "Say that we want a command to do quartic powers:\n", - "\n", - "$$ y = x_0^4 - 2 $$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { 
- "id": "JvXOVqSyFsdr" - }, - "outputs": [], - "source": [ - "y = X[:, 0] ** 4 - 2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-zoqaL8KGSK5" - }, - "source": [ - "We can do this by passing a string in Julia syntax.\n", - "\n", - "We also define the operator in sympy, with `extra_sympy_mappings`, to enable its use in `predict`, and other export functions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PoEkpvYuGUdy" - }, - "outputs": [], - "source": [ - "model = PySRRegressor(\n", - " niterations=5,\n", - " populations=40,\n", - " binary_operators=[\"plus\", \"mult\"],\n", - " unary_operators=[\"cos\", \"exp\", \"sin\", \"quart(x) = x^4\"],\n", - " extra_sympy_mappings={\"quart\": lambda x: x**4},\n", - ")\n", - "model.fit(X, y)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "emn2IajKbDgy" - }, - "outputs": [], - "source": [ - "model.sympy()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wbWHyOjl2_kX" - }, - "source": [ - "Since `quart` is arguably more complex than the other operators, you can also give it a different complexity, using, e.g., `complexity_of_operators={\"quart\": 2}` to give it a complexity of 2 (instead of the default 2). You can also define custom complexities for variables and constants (`complexity_of_variables` and `complexity_of_constants`, respectively - both take a single number).\n", - "\n", - "\n", - "One can also add a binary operator, with, e.g., `\"myoperator(x, y) = x^2 * y\"`. All Julia operators that work on scalar 32-bit floating point values are available.\n", - "\n", - "Make sure that any operator you add is valid over the real line. So, e.g., you will need to define `\"mysqrt(x) = sqrt(abs(x))\"` to enable it for negative numbers,\n", - "or, simply have it return a very large number for bad inputs (to prevent negative input in a soft way):\n", - "`\"mysqrt(x::T) where {T} = (x >= 0) ? 
x : T(-1e9)\"` (Julia syntax for a template function of input type `T`), which will make `mysqrt(x)` return -10^9 for negative x–hurting the loss of the equation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pEXT4xskbDg0" - }, - "source": [ - "## Scoring" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IyeYbVVOG60w" - }, - "source": [ - "Using `model_selection=\"best\"`selects the equation with the max score and prints it. But in practice it is best to look through all the equations manually, select an equation above some MSE threshold, and then use the score to select among that loss threshold.\n", - "\n", - "Here, \"score\" is defined by:\n", - "$$ \\text{score} = - \\log(\\text{loss}_i/\\text{loss}_{i-1})/\n", - "(\\text{complexity}_i - \\text{complexity}_{i-1})$$" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "I3IxmvSQrhfw" - }, - "source": [ - "This scoring is motivated by the common strategy of looking for drops in the loss-complexity curve.\n", - "\n", - "From Schmidt & Lipson (2009) -" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eUeXyoLxrd8o" - }, - "source": [ - "![F4.large.jpg]()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gDZyxsA7bDg9" - }, - "source": [ - "# Noise example" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cJCHdDt6IOou" - }, - "source": [ - "Here is an example with noise. Known Gaussian noise with $\\sigma$ between 0.1 and 5.0. We record samples of $y$:\n", - "\n", - "$$ \\sigma \\sim U(0.1, 5.0) $$\n", - "$$ \\epsilon \\sim \\mathcal{N}(0, \\sigma^2)$$\n", - "$$ y = 5\\;\\cos(3.5 x_0) - 1.3 + \\epsilon.$$\n", - "We have 5 features, say. 
The weights change the loss function to be:\n", - "$$MSE = \\sum [(y - f(x))^2*w],$$\n", - "\n", - "so in this example, we can set:\n", - "$$w = 1/\\sigma^2.$$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "up1RvmwyOdal" - }, - "outputs": [], - "source": [ - "np.random.seed(0)\n", - "N = 3000\n", - "upper_sigma = 5\n", - "X = 2 * np.random.rand(N, 5)\n", - "sigma = np.random.rand(N) * (5 - 0.1) + 0.1\n", - "eps = sigma * np.random.randn(N)\n", - "y = 5 * np.cos(3.5 * X[:, 0]) - 1.3 + eps" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-EJPDZbP5YEZ" - }, - "source": [ - "Let's look at this dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sqMqb4nJ5ZR5" - }, - "outputs": [], - "source": [ - "plt.scatter(X[:, 0], y, alpha=0.2)\n", - "plt.xlabel(\"$x_0$\")\n", - "plt.ylabel(\"$y$\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kaddasbBuDDv" - }, - "source": [ - "Define some weights to use:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3wqz9_sIbDhA" - }, - "outputs": [], - "source": [ - "weights = 1 / sigma ** 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "v8WBYtcZbDhC" - }, - "outputs": [], - "source": [ - "weights[:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NXWdQSCFuAzV" - }, - "source": [ - "Let's run PySR again:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "a07K3KUjOxcp" - }, - "outputs": [], - "source": [ - "model = PySRRegressor(\n", - " loss=\"myloss(x, y, w) = w * abs(x - y)\", # Custom loss function with weights.\n", - " niterations=20,\n", - " populations=20, # Use more populations\n", - " binary_operators=[\"plus\", \"mult\"],\n", - " unary_operators=[\"cos\"],\n", - ")\n", - "model.fit(X, y, weights=weights)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": 
"CHCMO9CouFLP" - }, - "source": [ - "Let's see if we get similar results to the true equation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oHyUbcg6ggmx" - }, - "outputs": [], - "source": [ - "model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OchlZZQP8Ums" - }, - "source": [ - "We can also filter all equations up to 2x the most accurate equation, then select the best score from that list:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PB67POLr8b_L" - }, - "outputs": [], - "source": [ - "best_idx = model.equations_.query(\n", - " f\"loss < {2 * model.equations_.loss.min()}\"\n", - ").score.idxmax()\n", - "model.sympy(best_idx)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SRHTP4x55roh" - }, - "source": [ - "We can also use `denoise=True`, which will run the input through a Gaussian process to denoise the dataset, before fitting on it." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eTGQ4NA78yAw" - }, - "source": [ - "Let's look at the fit:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ezCC0IkS8zFf" - }, - "outputs": [], - "source": [ - "plt.scatter(X[:, 0], y, alpha=0.1)\n", - "y_prediction = model.predict(X, index=best_idx)\n", - "plt.scatter(X[:, 0], y_prediction)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Multiple outputs" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For multiple outputs, multiple equations are returned:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X = 2 * np.random.randn(100, 5)\n", - "y = 1 / X[:, [0, 1, 2]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = PySRRegressor(\n", - " binary_operators=[\"+\", 
\"*\"],\n", - " unary_operators=[\"inv(x) = 1/x\"],\n", - " extra_sympy_mappings={\"inv\": lambda x: 1/x},\n", - ")\n", - "model.fit(X, y)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Julia packages and types" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "PySR uses [SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl)\n", - "as its search backend. This is a pure Julia package, and so can interface easily with any other\n", - "Julia package.\n", - "For some tasks, it may be necessary to load such a package.\n", - "\n", - "For example, let's say we wish to discovery the following relationship:\n", - "\n", - "$$ y = p_{3x + 1} - 5, $$\n", - "\n", - "where $p_i$ is the $i$th prime number, and $x$ is the input feature.\n", - "\n", - "Let's see if we can discover this using\n", - "the [Primes.jl](https://github.com/JuliaMath/Primes.jl) package.\n", - "\n", - "First, let's get the Julia backend\n", - "Here, we might choose to manually specify unlimited threads, `-O3`,\n", - "and `compile_modules=False`, although this will only propagate if Julia has not yet started:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pysr\n", - "jl = pysr.julia_helpers.init_julia(\n", - " julia_kwargs={\"threads\": \"auto\", \"optimize\": 2, \"compiled_modules\": False}\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "`jl` stores the Julia runtime.\n", - "\n", - "Now, let's run some Julia code to add the Primes.jl\n", - "package to the PySR environment:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "jl.eval(\"\"\"\n", - "import 
Pkg\n", - "Pkg.add(\"Primes\")\n", - "\"\"\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This imports the Julia package manager, and uses it to install\n", - "`Primes.jl`. Now let's import `Primes.jl`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "jl.eval(\"import Primes\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "Now, we define a custom operator:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "jl.eval(\"\"\"\n", - "function p(i::T) where T\n", - " if 0.5 < i < 1000\n", - " return T(Primes.prime(round(Int, i)))\n", - " else\n", - " return T(NaN)\n", - " end\n", - "end\n", - "\"\"\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "We have created a function `p`, which takes a number `i` of type `T` (e.g., `T=Float64`).\n", - "`p` first checks whether the input is between 0.5 and 1000.\n", - "If out-of-bounds, it returns `NaN`.\n", - "If in-bounds, it rounds it to the nearest integer, computes the corresponding prime number, and then\n", - "converts it to the same type as input.\n", - "\n", - "The equivalent function in Python would be:\n", - "\n", - "```python\n", - "import sympy\n", - "\n", - "def p(i):\n", - " if 0.5 < i < 1000:\n", - " return float(sympy.prime(int(round(i))))\n", - " else:\n", - " return float(\"nan\")\n", - "```\n", - "\n", - "(However, note that this version assumes 64-bit float input, rather than any input type `T`)\n", - "\n", - "Next, let's generate a list of primes for our test dataset.\n", - "Since we are using PyJulia, we can just call `p` directly to do this:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "primes = {i: jl.p(i*1.0) for i in range(1, 999)}" - ] 
- }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, let's use this list of primes to create a dataset of $x, y$ pairs:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "X = np.random.randint(0, 100, 100)[:, None]\n", - "y = [primes[3*X[i, 0] + 1] - 5 + np.random.randn()*0.001 for i in range(100)]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that we have also added a tiny bit of noise to the dataset.\n", - "\n", - "Finally, let's create a PySR model, and pass the custom operator. We also need to define the sympy equivalent, which we can leave as a placeholder for now:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pysr import PySRRegressor\n", - "import sympy\n", - "\n", - "class sympy_p(sympy.Function):\n", - " pass\n", - "\n", - "model = PySRRegressor(\n", - " binary_operators=[\"+\", \"-\", \"*\", \"/\"],\n", - " unary_operators=[\"p\"],\n", - " niterations=20,\n", - " extra_sympy_mappings={\"p\": sympy_p}\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ee30bd41", - "metadata": {}, - "source": [ - "We are all set to go! 
Let's see if we can find the true relation:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.fit(X, y)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "if all works out, you should be able to see the true relation (note that the constant offset might not be exactly 1, since it is allowed to round to the nearest integer).\n", - "\n", - "You can get the sympy version of the best equation with:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.sympy()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cPc1EDvRbDhL" - }, - "source": [ - "# High-dimensional input: Neural Nets + Symbolic Regression" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "3hS2kTAbbDhL" - }, - "source": [ - "In this example, let's learn a high-dimensional problem. **This will use the method proposed in our NeurIPS paper: https://arxiv.org/abs/2006.11287.**\n", - "\n", - "Let's consider a time series problem:\n", - "\n", - "$$ z = y^2,\\quad y = \\frac{1}{10} \\sum(y_i),\\quad y_i = x_{i0}^2 + 6 \\cos(2*x_{i2})$$\n", - "\n", - "Imagine our time series is 10 timesteps. That is very hard for symbolic regression, even if we impose the inductive bias of $$z=f(\\sum g(x_i))$$ - it is the square of the number of possible equations!\n", - "\n", - "But, as in our paper, **we can break this problem down into parts with a neural network. Then approximate the neural network with the symbolic regression!**\n", - "\n", - "Then, instead of, say, $(10^9)^2=10^{18}$ equations, we only have to consider $2\\times 10^9$ equations." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SXJGXySlbDhL" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "rstate = np.random.RandomState(0)\n", - "\n", - "N = 100000\n", - "Nt = 10\n", - "X = 6 * rstate.rand(N, Nt, 5) - 3\n", - "y_i = X[..., 0] ** 2 + 6 * np.cos(2 * X[..., 2])\n", - "y = np.sum(y_i, axis=1) / y_i.shape[1]\n", - "z = y**2\n", - "X.shape, y.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8ZqGupq_uSgp" - }, - "source": [ - "## Neural Network definition" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r2NR0h8-bDhN" - }, - "source": [ - "So, as described above, let's first use a neural network with the sum inductive bias to solve this problem.\n", - "\n", - "Essentially, we will learn two neural networks:\n", - "- `f`\n", - "- `g`\n", - "\n", - "each defined as a multi-layer perceptron. We will sum over `g` the same way as in our equation, but we won't define the summed part beforehand.\n", - "\n", - "Then, we will fit `g` and `f` **separately** using symbolic regression." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aca54ffa" - }, - "source": [ - "> **Warning**\n", - ">\n", - "> We import torch *after* already starting PyJulia. This is required due to interference between their C bindings. If you use torch, and then run PyJulia, you will likely hit a segfault. So keep this in mind for mixed deep learning + PyJulia/PySR workflows." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nWVfkV_YbDhO" - }, - "outputs": [], - "source": [ - "import torch\n", - "from torch import nn, optim\n", - "from torch.nn import functional as F\n", - "from torch.utils.data import DataLoader, TensorDataset\n", - "import pytorch_lightning as pl\n", - "\n", - "hidden = 128\n", - "total_steps = 30_000\n", - "\n", - "def mlp(size_in, size_out, act=nn.ReLU):\n", - " return nn.Sequential(\n", - " nn.Linear(size_in, hidden),\n", - " act(),\n", - " nn.Linear(hidden, hidden),\n", - " act(),\n", - " nn.Linear(hidden, hidden),\n", - " act(),\n", - " nn.Linear(hidden, size_out),\n", - " )\n", - "\n", - "\n", - "class SumNet(pl.LightningModule):\n", - " def __init__(self):\n", - " super().__init__()\n", - "\n", - " ########################################################\n", - " # The same inductive bias as above!\n", - " self.g = mlp(5, 1)\n", - " self.f = mlp(1, 1)\n", - "\n", - " def forward(self, x):\n", - " y_i = self.g(x)[:, :, 0]\n", - " y = torch.sum(y_i, dim=1, keepdim=True) / y_i.shape[1]\n", - " z = self.f(y)\n", - " return z[:, 0]\n", - "\n", - " ########################################################\n", - "\n", - " # PyTorch Lightning bookkeeping:\n", - " def training_step(self, batch, batch_idx):\n", - " x, z = batch\n", - " predicted_z = self(x)\n", - " loss = F.mse_loss(predicted_z, z)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " return self.training_step(batch, batch_idx)\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=self.max_lr)\n", - " scheduler = {\n", - " \"scheduler\": torch.optim.lr_scheduler.OneCycleLR(\n", - " optimizer,\n", - " max_lr=self.max_lr,\n", - " total_steps=self.trainer.estimated_stepping_batches,\n", - " final_div_factor=1e4,\n", - " ),\n", - " \"interval\": \"step\",\n", - " }\n", - " return [optimizer], [scheduler]" - ] - }, - { - "cell_type": 
"markdown", - "metadata": { - "id": "kK725aSEuUvG" - }, - "source": [ - "## Data bookkeeping" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KdWVtWUcbDhQ" - }, - "source": [ - "Put everything into PyTorch and do a train/test split:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0ym19abgbDhR" - }, - "outputs": [], - "source": [ - "from multiprocessing import cpu_count\n", - "Xt = torch.tensor(X).float()\n", - "zt = torch.tensor(z).float()\n", - "X_train, X_test, z_train, z_test = train_test_split(Xt, zt, random_state=0)\n", - "train_set = TensorDataset(X_train, z_train)\n", - "train = DataLoader(train_set, batch_size=128, num_workers=cpu_count(), shuffle=True, pin_memory=True)\n", - "test_set = TensorDataset(X_test, z_test)\n", - "test = DataLoader(test_set, batch_size=256, num_workers=cpu_count(), pin_memory=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3dw_NefuudIq" - }, - "source": [ - "## Train the model with PyTorch Lightning on GPUs:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hhlhLQUBbDhT" - }, - "source": [ - "Start the model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1ldN0999bDhU" - }, - "outputs": [], - "source": [ - "pl.seed_everything(0)\n", - "model = SumNet()\n", - "model.total_steps = total_steps\n", - "model.max_lr = 1e-2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WWRsu5A9bDhW" - }, - "source": [ - "PyTorch Lightning trainer object:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "33R2nrv-b62w" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(\n", - " max_steps=total_steps, accelerator=\"gpu\", devices=1\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jh91CukM5CkI" - }, - "source": [ - "Here, we fit the neural network:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - 
"id": "TXZdF8k1bDhY" - }, - "outputs": [], - "source": [ - "trainer.fit(model, train_dataloaders=train, val_dataloaders=test)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uYzk0yU4ulfH" - }, - "source": [ - "## Latent vectors of network\n", - "\n", - "Let's get the input and output of the learned `g` function from the network over some random data:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "s2sQLla5bDhb" - }, - "outputs": [], - "source": [ - "np.random.seed(0)\n", - "idx = np.random.randint(0, 10000, size=1000)\n", - "\n", - "X_for_pysr = Xt[idx]\n", - "y_i_for_pysr = model.g(X_for_pysr)[:, :, 0]\n", - "y_for_pysr = torch.sum(y_i_for_pysr, dim=1) / y_i_for_pysr.shape[1]\n", - "z_for_pysr = zt[idx] # Use true values.\n", - "\n", - "X_for_pysr.shape, y_i_for_pysr.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nCCIvvAGuyFi" - }, - "source": [ - "## Learning over the network:\n", - "\n", - "Now, let's fit `g` using PySR.\n", - "\n", - "> **Warning**\n", - ">\n", - "> First, let's save the data, because sometimes PyTorch and PyJulia's C bindings interfere and cause the colab kernel to crash. 
If we need to restart, we can just load the data without having to retrain the network:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "nnet_recordings = {\n", - " \"g_input\": X_for_pysr.detach().cpu().numpy().reshape(-1, 5),\n", - " \"g_output\": y_i_for_pysr.detach().cpu().numpy().reshape(-1),\n", - " \"f_input\": y_for_pysr.detach().cpu().numpy().reshape(-1, 1),\n", - " \"f_output\": z_for_pysr.detach().cpu().numpy().reshape(-1),\n", - "}\n", - "\n", - "# Save the data for later use:\n", - "import pickle as pkl\n", - "\n", - "with open(\"nnet_recordings.pkl\", \"wb\") as f:\n", - " pkl.dump(nnet_recordings, f)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now load the data, including after a crash (be sure to re-run the import cells at the top of this notebook, including the one that starts PyJulia)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle as pkl\n", - "\n", - "nnet_recordings = pkl.load(open(\"nnet_recordings.pkl\", \"rb\"))\n", - "f_input = nnet_recordings[\"f_input\"]\n", - "f_output = nnet_recordings[\"f_output\"]\n", - "g_input = nnet_recordings[\"g_input\"]\n", - "g_output = nnet_recordings[\"g_output\"]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now fit using a subsample of the data (symbolic regression only needs a small sample to find the best equation):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51QdHVSkbDhc" - }, - "outputs": [], - "source": [ - "rstate = np.random.RandomState(0)\n", - "f_sample_idx = rstate.choice(f_input.shape[0], size=500, replace=False)\n", - "\n", - "model = PySRRegressor(\n", - " niterations=20,\n", - " binary_operators=[\"plus\", \"sub\", \"mult\"],\n", - " unary_operators=[\"cos\", \"square\", \"neg\"],\n", - 
")\n", - "model.fit(g_input[f_sample_idx], g_output[f_sample_idx])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1a738a33" - }, - "source": [ - "If this segfaults, restart the notebook, and run the initial imports and PyJulia part, but skip the PyTorch training. This is because PyTorch's C binding tends to interefere with PyJulia. You can then re-run the `pkl.load` cell to import the data." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xginVMmTu3MZ" - }, - "source": [ - "## Validation" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "6WuaeqyqbDhe" - }, - "source": [ - "Recall we are searching for $f$ and $g$ such that:\n", - "$$z=f(\\sum g(x_i))$$ \n", - "which approximates the true relation:\n", - "$$ z = y^2,\\quad y = \\frac{1}{10} \\sum(y_i),\\quad y_i = x_{i0}^2 + 6 \\cos(2 x_{i2})$$\n", - "\n", - "Let's see how well we did in recovering $g$:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "E1_VWQ45bDhf" - }, - "outputs": [], - "source": [ - "model.equations_[[\"complexity\", \"loss\", \"equation\"]]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mlU1hidZkgCY" - }, - "source": [ - "A neural network can easily undo a linear transform (which commutes with the summation), so any affine transform in $g$ is to be expected. The network for $f$ has learned to undo the linear transform.\n", - "\n", - "This likely won't find the exact result, but it should find something similar. You may wish to try again but with many more `total_steps` for the neural network (10,000 is quite small!).\n", - "\n", - "Then, we can learn another analytic equation for $f$." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TntGlQEwbDhk" - }, - "source": [ - "**Now, we can compose these together to get the time series model!**\n", - "\n", - "Think about what we just did: we found an analytical equation for $z$ in terms of $500$ datapoints, under the assumption that $z$ is a function of a sum of another function over an axis:\n", - "\n", - "$$ z = f(\\sum_i g(x_i)) $$\n", - "\n", - "And we pulled out analytical copies for $g$ using symbolic regression." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1QsHVjAVbDhk" - }, - "source": [ - "# Other PySR Options" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S5dO61g1bDhk" - }, - "source": [ - "The full list of PySR parameters can be found here: https://astroautomata.com/PySR/api" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "pysr_demo.ipynb", - "provenance": [] - }, - "gpuClass": "standard", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "DS4E1PagbDgL" + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tQ1r1bbb0yBv" + }, + "source": [ + "\n", + "## Instructions\n", + "1. Work on a copy of this notebook: _File_ > _Save a copy in Drive_ (you will need a Google account).\n", + "2. (Optional) If you would like to do the deep learning component of this tutorial, turn on the GPU with Edit->Notebook settings->Hardware accelerator->GPU\n", + "3. Execute the following cell (click on it and press Ctrl+Enter) to install Julia, IJulia and other packages (if needed, update `JULIA_VERSION` and the other parameters). This takes a couple of minutes.\n", + "4. 
Continue to the next section.\n", + "\n", + "_Notes_:\n", + "* If your Colab Runtime gets reset (e.g., due to inactivity), repeat steps 3, 4.\n", + "* After installation, if you want to change the Julia version or activate/deactivate the GPU, you will need to reset the Runtime: _Runtime_ > _Delete and disconnect runtime_ and repeat steps 2-4." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "COndi88gbDgO" + }, + "source": [ + "**Run the following code to install Julia**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GIeFXS0F0zww" + }, + "outputs": [], + "source": [ + "%%shell\n", + "set -e\n", + "\n", + "#---------------------------------------------------#\n", + "JULIA_VERSION=\"1.8.5\"\n", + "export JULIA_PKG_PRECOMPILE_AUTO=0\n", + "#---------------------------------------------------#\n", + "\n", + "if [ -z `which julia` ]; then\n", + " # Install Julia\n", + " JULIA_VER=`cut -d '.' -f -2 <<< \"$JULIA_VERSION\"`\n", + " echo \"Installing Julia $JULIA_VERSION on the current Colab Runtime...\"\n", + " BASE_URL=\"https://julialang-s3.julialang.org/bin/linux/x64\"\n", + " URL=\"$BASE_URL/$JULIA_VER/julia-$JULIA_VERSION-linux-x86_64.tar.gz\"\n", + " wget -nv $URL -O /tmp/julia.tar.gz # -nv means \"not verbose\"\n", + " tar -x -f /tmp/julia.tar.gz -C /usr/local --strip-components 1\n", + " rm /tmp/julia.tar.gz\n", + "\n", + " echo \"Installing PyCall.jl...\"\n", + " julia -e 'using Pkg; Pkg.add(\"PyCall\"); Pkg.build(\"PyCall\")'\n", + " julia -e 'println(\"Success\")'\n", + "\n", + "fi" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ORv1c6xvbDgV" + }, + "source": [ + "Install PySR and PyTorch-Lightning:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EhMRSZEYFPLz" + }, + "outputs": [], + "source": [ + "%pip install -Uq pysr pytorch_lightning" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "etTMEV0wDqld" + }, + "source": [ + "The 
following step is not normally required, but colab's printing is non-standard and we need to manually set it up for PyJulia:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "j666aOI8xWF_" + }, + "outputs": [], + "source": [ + "from julia import Julia\n", + "\n", + "julia = Julia(compiled_modules=False, threads=\"auto\")\n", + "from julia import Main\n", + "from julia.tools import redirect_output_streams\n", + "\n", + "redirect_output_streams()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u2WhbVhht-G" + }, + "source": [ + "Let's install the backend of PySR, and all required libraries.\n", + "\n", + "**(This may take some time)**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "J-0QbxyK1_51" + }, + "outputs": [], + "source": [ + "import pysr\n", + "\n", + "# We don't precompile in colab because compiled modules are incompatible with static Python libraries:\n", + "pysr.install(precompile=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vFpyRxmhFqeH" + }, + "outputs": [], + "source": [ + "import sympy\n", + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "from pysr import PySRRegressor\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gsRMQ7grbDga" + }, + "source": [ + "# Simple PySR example:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "myTEwdiUFiGL" + }, + "source": [ + "First, let's learn a simple function\n", + "\n", + "$$2.5382 \\cos(x_3) + x_0^2 - 2$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Cb1eb2XuFQh8" + }, + "outputs": [], + "source": [ + "# Dataset\n", + "np.random.seed(0)\n", + "X = 2 * np.random.randn(100, 5)\n", + "y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cturCkaVjzLs" + }, +
"source": [ + "By default, we will set up 30 populations of expressions (which evolve independently except for migrations), use 4 threads, and use `\"best\"` for our model selection strategy:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4nDAAnisdhTc" + }, + "outputs": [], + "source": [ + "default_pysr_params = dict(\n", + " populations=30,\n", + " model_selection=\"best\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N4gANfkaj8ie" + }, + "source": [ + "PySR can run for arbitrarily long, and continue to find more and more accurate expressions. You can set the total number of cycles of evolution with `niterations`, although there are also a [few more ways](https://github.com/MilesCranmer/PySR/pull/134) to stop execution.\n", + "\n", + "**This first execution will take a bit longer to startup, as the library is JIT-compiled. The next execution will be much faster.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "p4PSrO-NK1Wa" + }, + "outputs": [], + "source": [ + "# Learn equations\n", + "model = PySRRegressor(\n", + " niterations=30,\n", + " binary_operators=[\"plus\", \"mult\"],\n", + " unary_operators=[\"cos\", \"exp\", \"sin\"],\n", + " **default_pysr_params\n", + ")\n", + "\n", + "model.fit(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-bsAECbdkQsQ" + }, + "source": [ + "We can print the model, which will print out all the discovered expressions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4HR8gknlZz4W" + }, + "outputs": [], + "source": [ + "model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ME3ddPxXkWQg" + }, + "source": [ + "We can also view the SymPy format of the best expression:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IQKOohdpztS7" + }, + "outputs": [], + "source": [ + "model.sympy()" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "EHIIPlmClltn" + }, + "source": [ + "We can also view the SymPy of any other expression in the list, using the index of it in `model.equations_`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GRcxq-TTlpRX" + }, + "outputs": [], + "source": [ + "model.sympy(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YMugcGX4tbqj" + }, + "source": [ + "## Output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gIWt5wz5cjXE" + }, + "source": [ + "`model.equations_` is a Pandas DataFrame. We can export the results in various ways:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HFGaNL6tbDgi" + }, + "outputs": [], + "source": [ + "model.latex()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4hS8kqutcmPQ" + }, + "source": [ + "These is also `model.sympy(), model.jax(), model.pytorch()`. All of these can take an index as input, to get the result for an arbitrary equation in the list.\n", + "\n", + "We can also use `model.predict` for arbitrary equations, with the default equation being the one chosen by `model_selection`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Vbz4IMsk2NYH" + }, + "outputs": [], + "source": [ + "ypredict = model.predict(X)\n", + "ypredict_simpler = model.predict(X, 2)\n", + "\n", + "print(\"Default selection MSE:\", np.power(ypredict - y, 2).mean())\n", + "print(\"Manual selection MSE for index 2:\", np.power(ypredict_simpler - y, 2).mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SQDUScGebDgr" + }, + "source": [ + "# Custom operators" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qvgVbOoSFtQY" + }, + "source": [ + "A full list of operators is given here: https://astroautomata.com/PySR/operators,\n", + "but we can also use any binary or unary operator in `julia`, or define our own as 
arbitrary functions.\n", + "\n", + "Say that we want a command to do quartic powers:\n", + "\n", + "$$ y = x_0^4 - 2 $$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JvXOVqSyFsdr" + }, + "outputs": [], + "source": [ + "y = X[:, 0] ** 4 - 2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-zoqaL8KGSK5" + }, + "source": [ + "We can do this by passing a string in Julia syntax.\n", + "\n", + "We also define the operator in sympy, with `extra_sympy_mappings`, to enable its use in `predict`, and other export functions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PoEkpvYuGUdy" + }, + "outputs": [], + "source": [ + "model = PySRRegressor(\n", + " niterations=5,\n", + " populations=40,\n", + " binary_operators=[\"plus\", \"mult\"],\n", + " unary_operators=[\"cos\", \"exp\", \"sin\", \"quart(x) = x^4\"],\n", + " extra_sympy_mappings={\"quart\": lambda x: x**4},\n", + ")\n", + "model.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "emn2IajKbDgy" + }, + "outputs": [], + "source": [ + "model.sympy()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wbWHyOjl2_kX" + }, + "source": [ + "Since `quart` is arguably more complex than the other operators, you can also give it a different complexity, using, e.g., `complexity_of_operators={\"quart\": 2}` to give it a complexity of 2 (instead of the default 1). You can also define custom complexities for variables and constants (`complexity_of_variables` and `complexity_of_constants`, respectively - both take a single number).\n", + "\n", + "\n", + "One can also add a binary operator, with, e.g., `\"myoperator(x, y) = x^2 * y\"`. All Julia operators that work on scalar 32-bit floating point values are available.\n", + "\n", + "Make sure that any operator you add is valid over the real line. 
So, e.g., you will need to define `\"mysqrt(x) = sqrt(abs(x))\"` to enable it for negative numbers,\n", + "or, simply have it return a very large number for bad inputs (to prevent negative input in a soft way):\n", + "`\"mysqrt(x::T) where {T} = (x >= 0) ? sqrt(x) : T(-1e9)\"` (Julia syntax for a template function of input type `T`), which will make `mysqrt(x)` return -10^9 for negative x–hurting the loss of the equation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pEXT4xskbDg0" + }, + "source": [ + "## Scoring" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IyeYbVVOG60w" + }, + "source": [ + "Using `model_selection=\"best\"` selects the equation with the max score and prints it. But in practice it is best to look through all the equations manually, select the equations below some MSE threshold, and then use the score to choose among them.\n", + "\n", + "Here, \"score\" is defined by:\n", + "$$ \\text{score} = - \\log(\\text{loss}_i/\\text{loss}_{i-1})/\n", + "(\\text{complexity}_i - \\text{complexity}_{i-1})$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I3IxmvSQrhfw" + }, + "source": [ + "This scoring is motivated by the common strategy of looking for drops in the loss-complexity curve.\n", + "\n", + "From Schmidt & Lipson (2009) -" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eUeXyoLxrd8o" + }, + "source": [ + "![F4.large.jpg]()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gDZyxsA7bDg9" + }, + "source": [ + "# Noise example" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cJCHdDt6IOou" + }, + "source": [ + "Here is an example with noise. Known Gaussian noise with $\\sigma$ between 0.1 and 5.0. We record samples of $y$:\n", + "\n", + "$$ \\sigma \\sim U(0.1, 5.0) $$\n", + "$$ \\epsilon \\sim \\mathcal{N}(0, \\sigma^2)$$\n", + "$$ y = 5\\;\\cos(3.5 x_0) - 1.3 + \\epsilon.$$\n", + "We have 5 features, say. 
The weights change the loss function to be:\n", + "$$MSE = \\sum [(y - f(x))^2*w],$$\n", + "\n", + "so in this example, we can set:\n", + "$$w = 1/\\sigma^2.$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "up1RvmwyOdal" + }, + "outputs": [], + "source": [ + "np.random.seed(0)\n", + "N = 3000\n", + "upper_sigma = 5\n", + "X = 2 * np.random.rand(N, 5)\n", + "sigma = np.random.rand(N) * (5 - 0.1) + 0.1\n", + "eps = sigma * np.random.randn(N)\n", + "y = 5 * np.cos(3.5 * X[:, 0]) - 1.3 + eps" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-EJPDZbP5YEZ" + }, + "source": [ + "Let's look at this dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sqMqb4nJ5ZR5" + }, + "outputs": [], + "source": [ + "plt.scatter(X[:, 0], y, alpha=0.2)\n", + "plt.xlabel(\"$x_0$\")\n", + "plt.ylabel(\"$y$\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kaddasbBuDDv" + }, + "source": [ + "Define some weights to use:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3wqz9_sIbDhA" + }, + "outputs": [], + "source": [ + "weights = 1 / sigma**2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "v8WBYtcZbDhC" + }, + "outputs": [], + "source": [ + "weights[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NXWdQSCFuAzV" + }, + "source": [ + "Let's run PySR again:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "a07K3KUjOxcp" + }, + "outputs": [], + "source": [ + "model = PySRRegressor(\n", + " loss=\"myloss(x, y, w) = w * abs(x - y)\", # Custom loss function with weights.\n", + " niterations=20,\n", + " populations=20, # Use more populations\n", + " binary_operators=[\"plus\", \"mult\"],\n", + " unary_operators=[\"cos\"],\n", + ")\n", + "model.fit(X, y, weights=weights)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": 
"CHCMO9CouFLP" + }, + "source": [ + "Let's see if we get similar results to the true equation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oHyUbcg6ggmx" + }, + "outputs": [], + "source": [ + "model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OchlZZQP8Ums" + }, + "source": [ + "We can also filter all equations up to 2x the most accurate equation, then select the best score from that list:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PB67POLr8b_L" + }, + "outputs": [], + "source": [ + "best_idx = model.equations_.query(\n", + " f\"loss < {2 * model.equations_.loss.min()}\"\n", + ").score.idxmax()\n", + "model.sympy(best_idx)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SRHTP4x55roh" + }, + "source": [ + "We can also use `denoise=True`, which will run the input through a Gaussian process to denoise the dataset, before fitting on it." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eTGQ4NA78yAw" + }, + "source": [ + "Let's look at the fit:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ezCC0IkS8zFf" + }, + "outputs": [], + "source": [ + "plt.scatter(X[:, 0], y, alpha=0.1)\n", + "y_prediction = model.predict(X, index=best_idx)\n", + "plt.scatter(X[:, 0], y_prediction)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiple outputs" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For multiple outputs, multiple equations are returned:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = 2 * np.random.randn(100, 5)\n", + "y = 1 / X[:, [0, 1, 2]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = PySRRegressor(\n", + " binary_operators=[\"+\", 
\"*\"],\n", + " unary_operators=[\"inv(x) = 1/x\"],\n", + " extra_sympy_mappings={\"inv\": lambda x: 1 / x},\n", + ")\n", + "model.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Julia packages and types" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "PySR uses [SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl)\n", + "as its search backend. This is a pure Julia package, and so can interface easily with any other\n", + "Julia package.\n", + "For some tasks, it may be necessary to load such a package.\n", + "\n", + "For example, let's say we wish to discovery the following relationship:\n", + "\n", + "$$ y = p_{3x + 1} - 5, $$\n", + "\n", + "where $p_i$ is the $i$th prime number, and $x$ is the input feature.\n", + "\n", + "Let's see if we can discover this using\n", + "the [Primes.jl](https://github.com/JuliaMath/Primes.jl) package.\n", + "\n", + "First, let's get the Julia backend\n", + "Here, we might choose to manually specify unlimited threads, `-O3`,\n", + "and `compile_modules=False`, although this will only propagate if Julia has not yet started:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pysr\n", + "\n", + "jl = pysr.julia_helpers.init_julia(\n", + " julia_kwargs={\"threads\": \"auto\", \"optimize\": 2, \"compiled_modules\": False}\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "`jl` stores the Julia runtime.\n", + "\n", + "Now, let's run some Julia code to add the Primes.jl\n", + "package to the PySR environment:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jl.eval(\n", + " 
\"\"\"\n", + "import Pkg\n", + "Pkg.add(\"Primes\")\n", + "\"\"\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This imports the Julia package manager, and uses it to install\n", + "`Primes.jl`. Now let's import `Primes.jl`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jl.eval(\"import Primes\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Now, we define a custom operator:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jl.eval(\n", + " \"\"\"\n", + "function p(i::T) where T\n", + " if 0.5 < i < 1000\n", + " return T(Primes.prime(round(Int, i)))\n", + " else\n", + " return T(NaN)\n", + " end\n", + "end\n", + "\"\"\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "We have created a function `p`, which takes a number `i` of type `T` (e.g., `T=Float64`).\n", + "`p` first checks whether the input is between 0.5 and 1000.\n", + "If out-of-bounds, it returns `NaN`.\n", + "If in-bounds, it rounds it to the nearest integer, computes the corresponding prime number, and then\n", + "converts it to the same type as input.\n", + "\n", + "The equivalent function in Python would be:\n", + "\n", + "```python\n", + "import sympy\n", + "\n", + "def p(i):\n", + " if 0.5 < i < 1000:\n", + " return float(sympy.prime(int(round(i))))\n", + " else:\n", + " return float(\"nan\")\n", + "```\n", + "\n", + "(However, note that this version assumes 64-bit float input, rather than any input type `T`)\n", + "\n", + "Next, let's generate a list of primes for our test dataset.\n", + "Since we are using PyJulia, we can just call `p` directly to do this:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "primes = 
{i: jl.p(i * 1.0) for i in range(1, 999)}" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's use this list of primes to create a dataset of $x, y$ pairs:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "X = np.random.randint(0, 100, 100)[:, None]\n", + "y = [primes[3 * X[i, 0] + 1] - 5 + np.random.randn() * 0.001 for i in range(100)]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we have also added a tiny bit of noise to the dataset.\n", + "\n", + "Finally, let's create a PySR model, and pass the custom operator. We also need to define the sympy equivalent, which we can leave as a placeholder for now:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pysr import PySRRegressor\n", + "import sympy\n", + "\n", + "\n", + "class sympy_p(sympy.Function):\n", + " pass\n", + "\n", + "\n", + "model = PySRRegressor(\n", + " binary_operators=[\"+\", \"-\", \"*\", \"/\"],\n", + " unary_operators=[\"p\"],\n", + " niterations=20,\n", + " extra_sympy_mappings={\"p\": sympy_p},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ee30bd41", + "metadata": {}, + "source": [ + "We are all set to go! 
Let's see if we can find the true relation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.fit(X, y)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If all works out, you should be able to see the true relation (note that the constant offset might not be exactly 1, since it is allowed to round to the nearest integer).\n", + "\n", + "You can get the sympy version of the best equation with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.sympy()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cPc1EDvRbDhL" + }, + "source": [ + "# High-dimensional input: Neural Nets + Symbolic Regression" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "3hS2kTAbbDhL" + }, + "source": [ + "In this example, let's learn a high-dimensional problem. **This will use the method proposed in our NeurIPS paper: https://arxiv.org/abs/2006.11287.**\n", + "\n", + "Let's consider a time series problem:\n", + "\n", + "$$ z = y^2,\\quad y = \\frac{1}{10} \\sum(y_i),\\quad y_i = x_{i0}^2 + 6 \\cos(2 x_{i2})$$\n", + "\n", + "Imagine our time series is 10 timesteps. That is very hard for symbolic regression, even if we impose the inductive bias of $$z=f(\\sum g(x_i))$$ - it is the square of the number of possible equations!\n", + "\n", + "But, as in our paper, **we can break this problem down into parts with a neural network. Then approximate the neural network with the symbolic regression!**\n", + "\n", + "Then, instead of, say, $(10^9)^2=10^{18}$ equations, we only have to consider $2\\times 10^9$ equations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SXJGXySlbDhL" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "rstate = np.random.RandomState(0)\n", + "\n", + "N = 100000\n", + "Nt = 10\n", + "X = 6 * rstate.rand(N, Nt, 5) - 3\n", + "y_i = X[..., 0] ** 2 + 6 * np.cos(2 * X[..., 2])\n", + "y = np.sum(y_i, axis=1) / y_i.shape[1]\n", + "z = y**2\n", + "X.shape, y.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8ZqGupq_uSgp" + }, + "source": [ + "## Neural Network definition" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r2NR0h8-bDhN" + }, + "source": [ + "So, as described above, let's first use a neural network with the sum inductive bias to solve this problem.\n", + "\n", + "Essentially, we will learn two neural networks:\n", + "- `f`\n", + "- `g`\n", + "\n", + "each defined as a multi-layer perceptron. We will sum over `g` the same way as in our equation, but we won't define the summed part beforehand.\n", + "\n", + "Then, we will fit `g` and `f` **separately** using symbolic regression." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aca54ffa" + }, + "source": [ + "> **Warning**\n", + ">\n", + "> We import torch *after* already starting PyJulia. This is required due to interference between their C bindings. If you use torch, and then run PyJulia, you will likely hit a segfault. So keep this in mind for mixed deep learning + PyJulia/PySR workflows." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nWVfkV_YbDhO" + }, + "outputs": [], + "source": [ + "import torch\n", + "from torch import nn, optim\n", + "from torch.nn import functional as F\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "import pytorch_lightning as pl\n", + "\n", + "hidden = 128\n", + "total_steps = 30_000\n", + "\n", + "\n", + "def mlp(size_in, size_out, act=nn.ReLU):\n", + " return nn.Sequential(\n", + " nn.Linear(size_in, hidden),\n", + " act(),\n", + " nn.Linear(hidden, hidden),\n", + " act(),\n", + " nn.Linear(hidden, hidden),\n", + " act(),\n", + " nn.Linear(hidden, size_out),\n", + " )\n", + "\n", + "\n", + "class SumNet(pl.LightningModule):\n", + " def __init__(self):\n", + " super().__init__()\n", + "\n", + " ########################################################\n", + " # The same inductive bias as above!\n", + " self.g = mlp(5, 1)\n", + " self.f = mlp(1, 1)\n", + "\n", + " def forward(self, x):\n", + " y_i = self.g(x)[:, :, 0]\n", + " y = torch.sum(y_i, dim=1, keepdim=True) / y_i.shape[1]\n", + " z = self.f(y)\n", + " return z[:, 0]\n", + "\n", + " ########################################################\n", + "\n", + " # PyTorch Lightning bookkeeping:\n", + " def training_step(self, batch, batch_idx):\n", + " x, z = batch\n", + " predicted_z = self(x)\n", + " loss = F.mse_loss(predicted_z, z)\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " return self.training_step(batch, batch_idx)\n", + "\n", + " def configure_optimizers(self):\n", + " optimizer = torch.optim.Adam(self.parameters(), lr=self.max_lr)\n", + " scheduler = {\n", + " \"scheduler\": torch.optim.lr_scheduler.OneCycleLR(\n", + " optimizer,\n", + " max_lr=self.max_lr,\n", + " total_steps=self.trainer.estimated_stepping_batches,\n", + " final_div_factor=1e4,\n", + " ),\n", + " \"interval\": \"step\",\n", + " }\n", + " return [optimizer], [scheduler]" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "kK725aSEuUvG" + }, + "source": [ + "## Data bookkeeping" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KdWVtWUcbDhQ" + }, + "source": [ + "Put everything into PyTorch and do a train/test split:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0ym19abgbDhR" + }, + "outputs": [], + "source": [ + "from multiprocessing import cpu_count\n", + "\n", + "Xt = torch.tensor(X).float()\n", + "zt = torch.tensor(z).float()\n", + "X_train, X_test, z_train, z_test = train_test_split(Xt, zt, random_state=0)\n", + "train_set = TensorDataset(X_train, z_train)\n", + "train = DataLoader(\n", + " train_set, batch_size=128, num_workers=cpu_count(), shuffle=True, pin_memory=True\n", + ")\n", + "test_set = TensorDataset(X_test, z_test)\n", + "test = DataLoader(test_set, batch_size=256, num_workers=cpu_count(), pin_memory=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3dw_NefuudIq" + }, + "source": [ + "## Train the model with PyTorch Lightning on GPUs:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hhlhLQUBbDhT" + }, + "source": [ + "Start the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1ldN0999bDhU" + }, + "outputs": [], + "source": [ + "pl.seed_everything(0)\n", + "model = SumNet()\n", + "model.total_steps = total_steps\n", + "model.max_lr = 1e-2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WWRsu5A9bDhW" + }, + "source": [ + "PyTorch Lightning trainer object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "33R2nrv-b62w" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(max_steps=total_steps, accelerator=\"gpu\", devices=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jh91CukM5CkI" + }, + "source": [ + "Here, we fit the neural network:" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "id": "TXZdF8k1bDhY" + }, + "outputs": [], + "source": [ + "trainer.fit(model, train_dataloaders=train, val_dataloaders=test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uYzk0yU4ulfH" + }, + "source": [ + "## Latent vectors of network\n", + "\n", + "Let's get the input and output of the learned `g` function from the network over some random data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "s2sQLla5bDhb" + }, + "outputs": [], + "source": [ + "np.random.seed(0)\n", + "idx = np.random.randint(0, 10000, size=1000)\n", + "\n", + "X_for_pysr = Xt[idx]\n", + "y_i_for_pysr = model.g(X_for_pysr)[:, :, 0]\n", + "y_for_pysr = torch.sum(y_i_for_pysr, dim=1) / y_i_for_pysr.shape[1]\n", + "z_for_pysr = zt[idx] # Use true values.\n", + "\n", + "X_for_pysr.shape, y_i_for_pysr.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nCCIvvAGuyFi" + }, + "source": [ + "## Learning over the network:\n", + "\n", + "Now, let's fit `g` using PySR.\n", + "\n", + "> **Warning**\n", + ">\n", + "> First, let's save the data, because sometimes PyTorch and PyJulia's C bindings interfere and cause the colab kernel to crash. 
If we need to restart, we can just load the data without having to retrain the network:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nnet_recordings = {\n", + " \"g_input\": X_for_pysr.detach().cpu().numpy().reshape(-1, 5),\n", + " \"g_output\": y_i_for_pysr.detach().cpu().numpy().reshape(-1),\n", + " \"f_input\": y_for_pysr.detach().cpu().numpy().reshape(-1, 1),\n", + " \"f_output\": z_for_pysr.detach().cpu().numpy().reshape(-1),\n", + "}\n", + "\n", + "# Save the data for later use:\n", + "import pickle as pkl\n", + "\n", + "with open(\"nnet_recordings.pkl\", \"wb\") as f:\n", + " pkl.dump(nnet_recordings, f)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now load the data, including after a crash (be sure to re-run the import cells at the top of this notebook, including the one that starts PyJulia)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle as pkl\n", + "\n", + "nnet_recordings = pkl.load(open(\"nnet_recordings.pkl\", \"rb\"))\n", + "f_input = nnet_recordings[\"f_input\"]\n", + "f_output = nnet_recordings[\"f_output\"]\n", + "g_input = nnet_recordings[\"g_input\"]\n", + "g_output = nnet_recordings[\"g_output\"]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now fit using a subsample of the data (symbolic regression only needs a small sample to find the best equation):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51QdHVSkbDhc" + }, + "outputs": [], + "source": [ + "rstate = np.random.RandomState(0)\n", + "f_sample_idx = rstate.choice(f_input.shape[0], size=500, replace=False)\n", + "\n", + "model = PySRRegressor(\n", + " niterations=20,\n", + " binary_operators=[\"plus\", \"sub\", \"mult\"],\n", + " unary_operators=[\"cos\", \"square\", \"neg\"],\n", + 
")\n", + "model.fit(g_input[f_sample_idx], g_output[f_sample_idx])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1a738a33" + }, + "source": [ + "If this segfaults, restart the notebook, and run the initial imports and PyJulia part, but skip the PyTorch training. This is because PyTorch's C binding tends to interefere with PyJulia. You can then re-run the `pkl.load` cell to import the data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xginVMmTu3MZ" + }, + "source": [ + "## Validation" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "6WuaeqyqbDhe" + }, + "source": [ + "Recall we are searching for $f$ and $g$ such that:\n", + "$$z=f(\\sum g(x_i))$$ \n", + "which approximates the true relation:\n", + "$$ z = y^2,\\quad y = \\frac{1}{10} \\sum(y_i),\\quad y_i = x_{i0}^2 + 6 \\cos(2 x_{i2})$$\n", + "\n", + "Let's see how well we did in recovering $g$:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "E1_VWQ45bDhf" + }, + "outputs": [], + "source": [ + "model.equations_[[\"complexity\", \"loss\", \"equation\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mlU1hidZkgCY" + }, + "source": [ + "A neural network can easily undo a linear transform (which commutes with the summation), so any affine transform in $g$ is to be expected. The network for $f$ has learned to undo the linear transform.\n", + "\n", + "This likely won't find the exact result, but it should find something similar. You may wish to try again but with many more `total_steps` for the neural network (10,000 is quite small!).\n", + "\n", + "Then, we can learn another analytic equation for $f$." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TntGlQEwbDhk" + }, + "source": [ + "**Now, we can compose these together to get the time series model!**\n", + "\n", + "Think about what we just did: we found an analytical equation for $z$ in terms of $500$ datapoints, under the assumption that $z$ is a function of a sum of another function over an axis:\n", + "\n", + "$$ z = f(\\sum_i g(x_i)) $$\n", + "\n", + "And we pulled out analytical copies for $g$ using symbolic regression." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1QsHVjAVbDhk" + }, + "source": [ + "# Other PySR Options" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S5dO61g1bDhk" + }, + "source": [ + "The full list of PySR parameters can be found here: https://astroautomata.com/PySR/api" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "pysr_demo.ipynb", + "provenance": [] + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/mkdocs.yml b/mkdocs.yml index dc92d026631d760f102ce04ce7b0886a91300db0..8199745ae71bdfc4fd397f8ae6a3b1368b9f20b2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,7 +13,7 @@ theme: toggle: icon: material/toggle-switch-off-outline name: Switch to light mode - + features: - navigation.expand diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..5d7bf33d948c45d40befd4769d0c15f74615472b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.isort] +profile = "black" diff --git a/pysr/__init__.py b/pysr/__init__.py index d9f9229f75a587d0f02ca8d1a0065a5dc33c9a12..5f2200356cf4810cbd8cfc941ae1032732e51ca4 100644 --- a/pysr/__init__.py +++ b/pysr/__init__.py @@ -1,14 +1,23 @@ from . 
import sklearn_monkeypatch -from .version import __version__ -from .sr import ( - pysr, - PySRRegressor, - best, - best_tex, - best_callable, - best_row, -) -from .julia_helpers import install -from .feynman_problems import Problem, FeynmanProblem from .export_jax import sympy2jax from .export_torch import sympy2torch +from .feynman_problems import FeynmanProblem, Problem +from .julia_helpers import install +from .sr import PySRRegressor, best, best_callable, best_row, best_tex, pysr +from .version import __version__ + +__all__ = [ + "sklearn_monkeypatch", + "sympy2jax", + "sympy2torch", + "FeynmanProblem", + "Problem", + "install", + "PySRRegressor", + "best", + "best_callable", + "best_row", + "best_tex", + "pysr", + "__version__", +] diff --git a/pysr/_cli/main.py b/pysr/_cli/main.py index 9d3551d98eb311c500e9cfb7cc7a0addb57cf5c8..d82ab79edf7987362e1c86183f4f112e5787e0b9 100644 --- a/pysr/_cli/main.py +++ b/pysr/_cli/main.py @@ -1,4 +1,5 @@ import click + from ..julia_helpers import install diff --git a/pysr/export_jax.py b/pysr/export_jax.py index 6570c3b67d626a5eae9633e65f72daefe2877b68..e1730ca4886fe5cede3cf3d75cda90d3a94e9805 100644 --- a/pysr/export_jax.py +++ b/pysr/export_jax.py @@ -1,7 +1,4 @@ -import functools as ft import sympy -import string -import random # Special since need to reduce arguments. 
MUL = 0 diff --git a/pysr/export_latex.py b/pysr/export_latex.py index 4e597157f11a7338c4a6a126eb86a9f18df418f0..a9a91d762c1b4b211c5f1f89ac05bc419d3cb0a0 100644 --- a/pysr/export_latex.py +++ b/pysr/export_latex.py @@ -1,9 +1,9 @@ """Functions to help export PySR equations to LaTeX.""" +from typing import List + +import pandas as pd import sympy from sympy.printing.latex import LatexPrinter -import pandas as pd -from typing import List -import warnings class PreciseLatexPrinter(LatexPrinter): diff --git a/pysr/export_numpy.py b/pysr/export_numpy.py index 5e2035935775b9a4e3884e79577ddba856e93e56..63e23dc6b9e93afda7680a396ebdd318404940fc 100644 --- a/pysr/export_numpy.py +++ b/pysr/export_numpy.py @@ -1,8 +1,9 @@ """Code for exporting discovered expressions to numpy""" +import warnings + import numpy as np import pandas as pd from sympy import lambdify -import warnings class CallableEquation: diff --git a/pysr/export_torch.py b/pysr/export_torch.py index 12ebe873199c7ea445850576e3a2c048119d8010..7fcb67e82f705485fadbbdeab9b22fab17f8fd06 100644 --- a/pysr/export_torch.py +++ b/pysr/export_torch.py @@ -5,6 +5,7 @@ import collections as co import functools as ft + import sympy diff --git a/pysr/feynman_problems.py b/pysr/feynman_problems.py index cdcc4cdba6c292cb23dc8e0bc103796c3c2dda2d..a264a901b597dc2b2846fba76f9dbf213cd79df6 100644 --- a/pysr/feynman_problems.py +++ b/pysr/feynman_problems.py @@ -1,8 +1,10 @@ -import numpy as np import csv -from .sr import pysr, best -from pathlib import Path from functools import partial +from pathlib import Path + +import numpy as np + +from .sr import best, pysr PKG_DIR = Path(__file__).parents[1] FEYNMAN_DATASET = PKG_DIR / "datasets" / "FeynmanEquations.csv" @@ -118,6 +120,7 @@ def do_feynman_experiments_parallel( data_dir=FEYNMAN_DATASET, ): import multiprocessing as mp + from tqdm import tqdm problems = mk_problems(first=first, gen=True, dp=dp, data_dir=data_dir) diff --git a/pysr/julia_helpers.py b/pysr/julia_helpers.py index 
566ba98aa764571a690d01e3cbd93715f1f079f7..7918c5e6d254ed0a9df08a13d4deee971c3cece9 100644 --- a/pysr/julia_helpers.py +++ b/pysr/julia_helpers.py @@ -1,12 +1,13 @@ """Functions for initializing the Julia environment and installing deps.""" -import sys +import os import subprocess +import sys import warnings from pathlib import Path -import os + from julia.api import JuliaError -from .version import __version__, __symbolic_regression_jl_version__ +from .version import __symbolic_regression_jl_version__, __version__ juliainfo = None julia_initialized = False diff --git a/pysr/sr.py b/pysr/sr.py index 056d0c8af3c37d6b077fcea7b35ae488f4a48d05..8638ac0e67c629a5b9f38bd56701ddc8f65984b1 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1,40 +1,37 @@ """Define the PySRRegressor scikit-learn interface.""" import copy -from io import StringIO import os -import sys -import numpy as np -import pandas as pd -import sympy -from sympy import sympify +import pickle as pkl import re -import tempfile import shutil -from pathlib import Path -import pickle as pkl -from datetime import datetime +import sys +import tempfile import warnings +from datetime import datetime +from io import StringIO from multiprocessing import cpu_count -from sklearn.base import BaseEstimator, RegressorMixin, MultiOutputMixin +from pathlib import Path + +import numpy as np +import pandas as pd +import sympy +from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin from sklearn.utils import check_array, check_consistent_length, check_random_state -from sklearn.utils.validation import ( - _check_feature_names_in, - check_is_fitted, -) +from sklearn.utils.validation import _check_feature_names_in, check_is_fitted +from sympy import sympify +from .deprecated import make_deprecated_kwargs_for_pysr_regressor +from .export_latex import generate_multiple_tables, generate_single_table, to_latex +from .export_numpy import CallableEquation from .julia_helpers import ( - init_julia, - _process_julia_project, 
- is_julia_version_greater_eq, _escape_filename, + _load_backend, _load_cluster_manager, + _process_julia_project, _update_julia_project, - _load_backend, + init_julia, + is_julia_version_greater_eq, ) -from .export_numpy import CallableEquation -from .export_latex import generate_single_table, generate_multiple_tables, to_latex -from .deprecated import make_deprecated_kwargs_for_pysr_regressor - Main = None # TODO: Rename to more descriptive name like "julia_runtime" @@ -2454,7 +2451,7 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int: def _denoise(X, y, Xresampled=None, random_state=None): """Denoise the dataset using a Gaussian process.""" from sklearn.gaussian_process import GaussianProcessRegressor - from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel + from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel() gpr = GaussianProcessRegressor( diff --git a/pysr/test/__init__.py b/pysr/test/__init__.py index 9fa94f52760678d41d44a15198adb6cdfe59ed22..7b176eab67718b614d8226ef6572871b97b2211a 100644 --- a/pysr/test/__init__.py +++ b/pysr/test/__init__.py @@ -1,5 +1,7 @@ from .test import runtests +from .test_cli import runtests as runtests_cli from .test_env import runtests as runtests_env from .test_jax import runtests as runtests_jax from .test_torch import runtests as runtests_torch -from .test_cli import runtests as runtests_cli + +__all__ = ["runtests", "runtests_env", "runtests_jax", "runtests_torch", "runtests_cli"] diff --git a/pysr/test/test.py b/pysr/test/test.py index b5b657e79ad6371e24831896dad10e2c51caf6b2..8d6937e406406c2fb9d759e13de1faf8e19dc602 100644 --- a/pysr/test/test.py +++ b/pysr/test/test.py @@ -1,28 +1,28 @@ +import inspect import os +import pickle as pkl +import tempfile import traceback -import inspect import unittest +import warnings +from pathlib import Path + import numpy as np +import 
pandas as pd +import sympy from sklearn import model_selection from sklearn.utils.estimator_checks import check_estimator -import sympy -import pandas as pd -import warnings -import pickle as pkl -import tempfile -from pathlib import Path -from .. import julia_helpers -from .. import PySRRegressor +from .. import PySRRegressor, julia_helpers +from ..export_latex import to_latex from ..sr import ( - run_feature_selection, - _handle_feature_selection, - _csv_filename_to_pkl_filename, - idx_model_selection, _check_assertions, + _csv_filename_to_pkl_filename, + _handle_feature_selection, _process_constraints, + idx_model_selection, + run_feature_selection, ) -from ..export_latex import to_latex DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default diff --git a/pysr/test/test_cli.py b/pysr/test/test_cli.py index a1270c82feb79e85146cc8714d2499d7a15f113b..0a97a1eb17d0bf7b4e097bff7ac417c4e60034f8 100644 --- a/pysr/test/test_cli.py +++ b/pysr/test/test_cli.py @@ -1,5 +1,7 @@ import unittest + from click import testing as click_testing + from .._cli.main import pysr diff --git a/pysr/test/test_env.py b/pysr/test/test_env.py index c044cfd0cd4247aebbfe111625b3e000611cc43c..423a3064ad792cacba5e66effb85546850003dc0 100644 --- a/pysr/test/test_env.py +++ b/pysr/test/test_env.py @@ -1,7 +1,7 @@ """Contains tests for creating and initializing custom Julia projects.""" -import unittest import os +import unittest from tempfile import TemporaryDirectory from .. import julia_helpers diff --git a/pysr/test/test_jax.py b/pysr/test/test_jax.py index e9834cadd4fe8d84c4bc69635a28740580bc4448..aaafb97cf6b3be04987c56df7f2c87c83baf3ae6 100644 --- a/pysr/test/test_jax.py +++ b/pysr/test/test_jax.py @@ -1,10 +1,11 @@ import unittest +from functools import partial + import numpy as np import pandas as pd import sympy -from functools import partial -from .. import sympy2jax, PySRRegressor +from .. 
import PySRRegressor, sympy2jax class TestJAX(unittest.TestCase): diff --git a/pysr/test/test_torch.py b/pysr/test/test_torch.py index 3520b28ee48e9bbebad0c98fa2a6b588db9fa031..5a71af71ff55decc488a86fb47cbe99bb90a231d 100644 --- a/pysr/test/test_torch.py +++ b/pysr/test/test_torch.py @@ -1,9 +1,11 @@ +import platform import unittest + import numpy as np import pandas as pd -import platform import sympy -from .. import sympy2torch, PySRRegressor + +from .. import PySRRegressor, sympy2torch # Need to initialize Julia before importing torch...