diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000000000000000000000000000000000000..7c346434dcba3f4d459a71228020e5e83e2e1f4e --- /dev/null +++ b/.coveragerc @@ -0,0 +1,4 @@ +[run] +omit = + */test/* +source = pysr diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index e141dae30ba9f0ddd6e2b175f79b78ede8735b32..9156e02eaf920bd15a02e78cbcec5e553a96d65d 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -24,6 +24,8 @@ jobs: test: runs-on: ${{ matrix.os }} timeout-minutes: 60 + env: + COVERAGE_PROCESS_START: "${{ github.workspace }}/.coveragerc" defaults: run: shell: bash @@ -38,6 +40,10 @@ jobs: python-version: '3.7' os: ubuntu-latest test-id: include + - julia-version: '1' + python-version: '3.12' + os: ubuntu-latest + test-id: include steps: - uses: actions/checkout@v4 @@ -58,29 +64,29 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install -r requirements.txt - python setup.py install - python -m pysr install - - name: "Install Coverage tool" - run: pip install coverage coveralls + pip install . 
+ python -c 'import pysr' + - name: "Assert Julia version" + if: ${{ matrix.julia-version != '1'}} + run: python3 -c "from pysr import jl; assert jl.VERSION.major == jl.seval('v\"${{ matrix.julia-version }}\"').major; assert jl.VERSION.minor == jl.seval('v\"${{ matrix.julia-version }}\"').minor" + - name: "Install test dependencies" + run: pip install coverage coveralls pytest nbval + - name: "Set up coverage for subprocesses" + run: echo 'import coverage; coverage.process_startup()' > "${{ github.workspace }}/sitecustomize.py" - name: "Run tests" - run: | - coverage run --source=pysr --omit='*/test/*,*/feynman_problems.py' -m pysr.test main - coverage run --append --source=pysr --omit='*/test/*,*/feynman_problems.py' -m pysr.test cli + run: coverage run -m pysr test main,cli,startup - name: "Install JAX" run: pip install jax jaxlib # (optional import) if: ${{ matrix.test-id == 'main' }} - name: "Run JAX tests" - run: coverage run --append --source=pysr --omit='*/test/*,*/feynman_problems.py' -m pysr.test jax + run: coverage run --append -m pysr test jax if: ${{ matrix.test-id == 'main' }} - name: "Install Torch" run: pip install torch # (optional import) if: ${{ matrix.test-id == 'main' }} - name: "Run Torch tests" - run: coverage run --append --source=pysr --omit='*/test/*,*/feynman_problems.py' -m pysr.test torch + run: coverage run --append -m pysr test torch if: ${{ matrix.test-id == 'main' }} - - name: "Run custom env tests" - run: coverage run --append --source=pysr --omit='*/test/*,*/feynman_problems.py' -m pysr.test env - name: "Coveralls" env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -88,14 +94,26 @@ jobs: COVERALLS_PARALLEL: true run: coveralls --service=github - incremental_install: - runs-on: ubuntu-latest + dev_install: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ['ubuntu-latest'] + python-version: ['3.11'] + julia-version: ['1'] + include: + - os: ubuntu-latest + python-version: '3.7' + julia-version: '1.6' steps: - uses: 
actions/checkout@v4 - - name: "Build incremental install" - run: docker build -t pysr -f pysr/test/incremental_install_simulator.dockerfile . - - name: "Test incremental install" - run: docker run --rm pysr /bin/bash -l -c 'python3 -m pysr.test main && python3 -m pysr.test env' + - uses: actions/setup-python@v5 + - name: "Install PySR" + run: | + python -m pip install --upgrade pip + pip install . + - name: "Run development test" + run: PYSR_TEST_JULIA_VERSION=${{ matrix.julia-version }} PYSR_TEST_PYTHON_VERSION=${{ matrix.python-version }} python -m pysr test dev conda_test: runs-on: ${{ matrix.os }} @@ -133,9 +151,9 @@ jobs: - name: "Install PySR" run: | python3 -m pip install . - python3 -m pysr install + python3 -c 'import pysr' - name: "Run tests" - run: cd /tmp && python -m pysr.test main + run: cd /tmp && python -m pysr test main coveralls: name: Indicate completion to coveralls.io @@ -177,9 +195,8 @@ jobs: - name: "Install PySR and all dependencies" run: | python -m pip install --upgrade pip - python -m pip install -r requirements.txt - python -m pip install mypy - python -m pip install . + pip install . + pip install mypy - name: "Install additional dependencies" run: python -m pip install jax jaxlib torch if: ${{ matrix.python-version != '3.7' }} diff --git a/.github/workflows/CI_Windows.yml b/.github/workflows/CI_Windows.yml index e6850c7750feb8525d26429dace945d502228d73..62771c45151788ba1a8fb466ffb34a504c2fe42f 100644 --- a/.github/workflows/CI_Windows.yml +++ b/.github/workflows/CI_Windows.yml @@ -52,16 +52,13 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install -r requirements.txt - python setup.py install - python -m pysr install + pip install pytest nbval + pip install . 
+ python -c 'import pysr' - name: "Run tests" run: | - python -m pysr.test main - python -m pysr.test cli + python -m pysr test main,cli,startup - name: "Install Torch" run: pip install torch # (optional import) - name: "Run Torch tests" - run: python -m pysr.test torch - - name: "Run custom env tests" - run: python -m pysr.test env + run: python -m pysr test torch diff --git a/.github/workflows/CI_conda_forge.yml b/.github/workflows/CI_conda_forge.yml index 06960eb265e06f221f227258d80813465ed8d721..e6292b5e6c043491866539c86dc9014b4bc07f60 100644 --- a/.github/workflows/CI_conda_forge.yml +++ b/.github/workflows/CI_conda_forge.yml @@ -40,4 +40,6 @@ jobs: run: conda activate pysr-test && conda install pysr if: ${{ !matrix.use-mamba }} - name: "Run tests" - run: python -m pysr.test main + run: | + pip install pytest nbval + python -m pysr test main,startup diff --git a/.github/workflows/CI_docker.yml b/.github/workflows/CI_docker.yml index 2da0dc283b54e3e0efcbd553ebd01b0c911b7a40..80c3bc894285fb42b2a693b87c4004ab73b4eff2 100644 --- a/.github/workflows/CI_docker.yml +++ b/.github/workflows/CI_docker.yml @@ -37,4 +37,4 @@ jobs: - name: Build docker run: docker build --platform=${{ matrix.arch }} -t pysr . 
- name: Test docker - run: docker run --platform=${{ matrix.arch }} --rm pysr /bin/bash -c 'python3 -m pysr.test main && python3 -m pysr.test cli && python3 -m pysr.test env' + run: docker run --platform=${{ matrix.arch }} --rm pysr /bin/bash -c 'pip install pytest nbval && python3 -m pysr test main,cli,startup' diff --git a/.github/workflows/CI_docker_large_nightly.yml b/.github/workflows/CI_docker_large_nightly.yml index 98644a9658ec29526c340009190a6d809a666b8e..2077247e7a819ed580b86bc3c012e7c9419a3bbb 100644 --- a/.github/workflows/CI_docker_large_nightly.yml +++ b/.github/workflows/CI_docker_large_nightly.yml @@ -33,4 +33,4 @@ jobs: - name: Build docker run: docker build --platform=${{ matrix.arch }} -t pysr --build-arg JLVERSION=${{ matrix.julia-version }} --build-arg PYVERSION=${{ matrix.python-version }} . - name: Test docker - run: docker run --platform=${{ matrix.arch }} --rm pysr /bin/bash -c 'python3 -m pysr.test main && python3 -m pysr.test cli && python3 -m pysr.test env' + run: docker run --platform=${{ matrix.arch }} --rm pysr /bin/bash -c 'pip install pytest nbval && python3 -m pysr test main,cli,startup' diff --git a/.github/workflows/CI_large_nightly.yml b/.github/workflows/CI_large_nightly.yml index da314c29bd1af21a55aa0495527408f0e25faea8..fd68df0b7b742cdd976eb2a243814e0dcdb83c2a 100644 --- a/.github/workflows/CI_large_nightly.yml +++ b/.github/workflows/CI_large_nightly.yml @@ -40,13 +40,11 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install -r requirements.txt - python setup.py install - python -m pysr install + pip install pytest nbval + pip install . 
+ python -c 'import pysr' + - name: "Assert Julia version" + if: ${{ matrix.julia-version != '1'}} + run: python3 -c "from pysr import jl; assert jl.VERSION.major == jl.seval('v\"${{ matrix.julia-version }}\"').major; assert jl.VERSION.minor == jl.seval('v\"${{ matrix.julia-version }}\"').minor" - name: "Run tests" - run: | - python -m pysr.test main - python -m pysr.test cli - - name: "Run new env test" - run: python -m pysr.test env - if: ${{ !(matrix.os == 'windows-latest' && matrix.python-version == '3.7') }} + run: python -m pysr test main,cli,startup diff --git a/.github/workflows/CI_mac.yml b/.github/workflows/CI_mac.yml index cb5be3a46abbc20550def29864a16a6571291258..30004fbd7c22d6ed4d6d7c29e7ab94ed770a5b10 100644 --- a/.github/workflows/CI_mac.yml +++ b/.github/workflows/CI_mac.yml @@ -52,20 +52,17 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install -r requirements.txt - python setup.py install - python -m pysr install + pip install pytest nbval + pip install . 
+ python -c 'import pysr' - name: "Run tests" run: | - python -m pysr.test main - python -m pysr.test cli + python -m pysr test main,cli,startup - name: "Install JAX" run: pip install jax jaxlib # (optional import) - name: "Run JAX tests" - run: python -m pysr.test jax + run: python -m pysr test jax - name: "Install Torch" run: pip install torch # (optional import) - name: "Run Torch tests" - run: python -m pysr.test torch - - name: "Run custom env tests" - run: python -m pysr.test env + run: python -m pysr test torch diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 33137562e28810ac7b2bde89e51371b395b32acf..95a9578f26d5498ea91470dbdfd3abe798d2be0d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,11 +13,13 @@ repos: hooks: - id: black - id: black-jupyter + exclude: pysr/test/test_nb.ipynb # Stripping notebooks - repo: https://github.com/kynan/nbstripout rev: 0.6.1 hooks: - id: nbstripout + exclude: pysr/test/test_nb.ipynb # Unused imports - repo: https://github.com/hadialqattan/pycln rev: "v2.4.0" diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f1695ea758c184e7146c95eb10ef7e2657187d74..0a4e4f0f1405a295a33892299f2e7e77b21bb72e 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -42,7 +42,7 @@ Scan through our [existing issues](https://github.com/MilesCranmer/PySR/issues) check out the [guide](https://astroautomata.com/PySR/backend/) on modifying a custom SymbolicRegression.jl library. In this case, you might instead be interested in making suggestions to the [SymbolicRegression.jl](http://github.com/MilesCranmer/SymbolicRegression.jl) library. -4. You can install your local version of PySR with `python setup.py install`, and run tests with `python -m pysr.test main`. +4. You can install your local version of PySR with `python setup.py install`, and run tests with `python -m pysr test main`. 
### Commit your update diff --git a/Dockerfile b/Dockerfile index 86824e8d73e9af93470859b088515c0063eaf61e..8b87b92561a84f30d309e70d08c66d1ceec6935f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,22 +13,23 @@ COPY --from=jl /usr/local/julia /usr/local/julia ENV PATH="/usr/local/julia/bin:${PATH}" # Install IPython and other useful libraries: -RUN pip install ipython matplotlib +RUN pip install --no-cache-dir ipython matplotlib WORKDIR /pysr # Caches install (https://stackoverflow.com/questions/25305788/how-to-avoid-reinstalling-packages-when-building-docker-image-for-python-project) ADD ./requirements.txt /pysr/requirements.txt -RUN pip3 install -r /pysr/requirements.txt +RUN pip3 install --no-cache-dir -r /pysr/requirements.txt # Install PySR: # We do a minimal copy so it doesn't need to rerun at every file change: +ADD ./pyproject.toml /pysr/pyproject.toml ADD ./setup.py /pysr/setup.py -ADD ./pysr/ /pysr/pysr/ -RUN pip3 install . +ADD ./pysr /pysr/pysr +RUN pip3 install --no-cache-dir . # Install Julia pre-requisites: -RUN python3 -m pysr install +RUN python3 -c 'import pysr' # metainformation LABEL org.opencontainers.image.authors = "Miles Cranmer" diff --git a/README.md b/README.md index d6f541a12ebc41cf178c89be4aa2d76f1c244f71..35772f487310c378c6a74994afa54270c0d4f3ac 100644 --- a/README.md +++ b/README.md @@ -27,10 +27,6 @@ If you've finished a project with PySR, please submit a PR to showcase your work - [Contributors](#contributors-) - [Why PySR?](#why-pysr) - [Installation](#installation) - - [pip](#pip) - - [conda](#conda) - - [docker](#docker-build) - - [Troubleshooting](#troubleshooting) - [Quickstart](#quickstart) - [→ Documentation](https://astroautomata.com/PySR) @@ -129,48 +125,31 @@ an explicit and powerful way to interpret deep neural networks. 
## Installation -| [pip](#pip) | [conda](#conda) | [docker](#docker-build) | -|:---:|:---:|:---:| -| Everywhere (recommended) | Linux and Intel-based macOS | Everywhere (if all else fails) | +### Pip ---- - -### pip +You can install PySR with pip: -1. [Install Julia](https://julialang.org/downloads/) - - Alternatively, my personal preference is to use [juliaup](https://github.com/JuliaLang/juliaup#installation), which performs this automatically. -2. Then, run: ```bash -pip3 install -U pysr +pip install pysr ``` -3. Finally, to install Julia dependencies: -```bash -python3 -m pysr install -``` -> (Alternatively, from within Python, you can call `import pysr; pysr.install()`) ---- +Julia dependencies will be installed at first import. -### conda +### Conda -The PySR build in conda includes all required dependencies, so you can install it by simply running: +Similarly, with conda: ```bash conda install -c conda-forge pysr ``` -from within your target conda environment. -However, note that the conda install does not support precompilation of Julia libraries, so the -start time may be slightly slower as the JIT-compilation will be running. -(Once the compilation finishes, there will not be a performance difference though.) - ---- +### Docker -### docker build +You can also use the `Dockerfile` to install PySR in a docker container 1. Clone this repo. -2. In the repo, run the build command with: +2. Within the repo's directory, build the docker container: ```bash docker build -t pysr . ``` @@ -185,11 +164,7 @@ For more details, see the [docker section](#docker). ### Troubleshooting -Common issues tend to be related to Python not finding Julia. -To debug this, try running `python3 -c 'import os; print(os.environ["PATH"])'`. -If none of these folders contain your Julia binary, then you need to add Julia's `bin` folder to your `PATH` environment variable. 
- -Another issue you might run into can result in a hard crash at import with +One issue you might run into can result in a hard crash at import with a message like "`GLIBCXX_...` not found". This is due to another one of the Python dependencies loading an incorrect `libstdc++` library. To fix this, you should modify your `LD_LIBRARY_PATH` variable to reference the Julia libraries. For example, if the Julia @@ -202,7 +177,6 @@ export LD_LIBRARY_PATH=$HOME/.julia/juliaup/julia-1.10.0+0.x64.linux.gnu/lib/jul to your `.bashrc` or `.zshrc` file. -**Running PySR on macOS with an M1 processor:** you should use the pip version, and make sure to get the Julia binary for ARM/M-series processors. ## Quickstart @@ -240,7 +214,7 @@ model = PySRRegressor( ], extra_sympy_mappings={"inv": lambda x: 1 / x}, # ^ Define operator for SymPy as well - loss="loss(prediction, target) = (prediction - target)^2", + elementwise_loss="loss(prediction, target) = (prediction - target)^2", # ^ Custom loss function (julia syntax) ) ``` @@ -323,7 +297,7 @@ model = PySRRegressor( # ^ 2 populations per core, so one is always running. population_size=50, # ^ Slightly larger populations, for greater diversity. - ncyclesperiteration=500, + ncycles_per_iteration=500, # ^ Generations between migrations. niterations=10000000, # Run forever early_stop_condition=( diff --git a/TODO.md b/TODO.md deleted file mode 100644 index 0ced06bbfdae69cfdf0ac9a709b5cf5dd7c65a95..0000000000000000000000000000000000000000 --- a/TODO.md +++ /dev/null @@ -1,142 +0,0 @@ -# TODO - -- [x] Async threading, and have a server of equations. So that threads aren't waiting for others to finish. -- [x] Print out speed of equation evaluation over time. Measure time it takes per cycle -- [x] Add ability to pass an operator as an anonymous function string. E.g., `binary_operators=["g(x, y) = x+y"]`. -- [x] Add error bar capability (thanks Johannes Buchner for suggestion) -- [x] Why don't the constants continually change? 
It should optimize them every time the equation appears. - - Restart the optimizer to help with this. -- [x] Add several common unary and binary operators; list these. -- [x] Try other initial conditions for optimizer -- [x] Make scaling of changes to constant a hyperparameter -- [x] Make deletion op join deleted subtree to parent -- [x] Update hall of fame every iteration? - - Seems to overfit early if we do this. -- [x] Consider adding mutation to pass an operator in through a new binary operator (e.g., exp(x3)->plus(exp(x3), ...)) - - (Added full insertion operator -- [x] Add a node at the top of a tree -- [x] Insert a node at the top of a subtree -- [x] Record very best individual in each population, and return at end. -- [x] Write our own tree copy operation; deepcopy() is the slowest operation by far. -- [x] Hyperparameter tune -- [x] Create a benchmark for accuracy -- [x] Add interface for either defining an operation to learn, or loading in arbitrary dataset. - - Could just write out the dataset in julia, or load it. -- [x] Create a Python interface -- [x] Explicit constant optimization on hall-of-fame - - Create method to find and return all constants, from left to right - - Create method to find and set all constants, in same order - - Pull up some optimization algorithm and add it. Keep the package small! -- [x] Create a benchmark for speed -- [x] Simplify subtrees with only constants beneath them. Or should I? Maybe randomly simplify sometimes? -- [x] Record hall of fame -- [x] Optionally (with hyperparameter) migrate the hall of fame, rather than current bests -- [x] Test performance of reduced precision integers - - No effect -- [x] Create struct to pass through all hyperparameters, instead of treating as constants - - Make sure doesn't affect performance -- [x] Rename package to avoid trademark issues - - PySR? -- [x] Put on PyPI -- [x] Treat baseline as a solution. 
-- [x] Print score alongside MSE: \delta \log(MSE)/\delta \log(complexity) -- [x] Calculating the loss function - there is duplicate calculations happening. -- [x] Declaration of the weights array every iteration -- [x] Sympy evaluation -- [x] Threaded recursion -- [x] Test suite -- [x] Performance: - Use an enum for functions instead of storing them? - - Gets ~40% speedup on small test. -- [x] Use @fastmath -- [x] Try @spawn over each sub-population. Do random sort, compute mutation for each, then replace 10% oldest. -- [x] Control max depth, rather than max number of nodes? -- [x] Allow user to pass names for variables - use these when printing -- [x] Check for domain errors in an equation quickly before actually running the entire array over it. (We do this now recursively - every single equation is checked for nans/infs when being computed.) -- [x] read the docs page -- [x] Create backup csv file so always something to copy from for `PySR`. Also use random hall of fame file by default. Call function to read from csv after running, so dont need to run again. Dump scores alongside MSE to .csv (and return with Pandas). -- [x] Better cleanup of zombie processes after -- [x] Consider printing output sorted by score, not by complexity. -- [x] Increase max complexity slowly over time up to the actual max. -- [x] Record density over complexity. Favor equations that have a density we have not explored yet. Want the final density to be evenly distributed. -- [x] Do printing from Python side. Then we can do simplification and pretty-printing. -- [x] Sympy printing -- [x] Store Project.toml inside PySR's python code, rather than copied to site-packages. -- [ ] Sort these todo lists by priority - -- [ ] Automatically convert log, log10, log2, pow to the correct operators. -- [ ] I think the simplification isn't working correctly (post-merging SymbolicUtils.) -- [ ] Show demo of PySRRegressor. Fit equations, then show how to view equations. 
-- [ ] Add "selected" column string to regular equations dict. -- [ ] List "Loss" instead of "MSE" - -## Feature ideas - -- [ ] Other default losses (e.g., abs, other likelihoods, or just allow user to pass this as a string). -- [ ] Other dtypes available -- [ ] NDSA-II -- [ ] Cross-validation -- [ ] Hierarchical model, so can re-use functional forms. Output of one equation goes into second equation? -- [ ] Add function to plot equations -- [ ] Refresh screen rather than dumping to stdout? -- [ ] Add ability to save state from python -- [ ] Additional degree operators? -- [ ] Multi targets (vector ops). Idea 1: Node struct contains argument for which registers it is applied to. Then, can work with multiple components simultaneously. Though this may be tricky to get right. Idea 2: each op is defined by input/output space. Some operators are flexible, and the spaces should be adjusted automatically. Otherwise, only consider ops that make a tree possible. But will need additional ops here to get it to work. Idea 3: define each equation in 2 parts: one part that is shared between all outputs, and one that is different between all outputs. Maybe this could be an array of nodes corresponding to each output. And those nodes would define their functions. - - Much easier option: simply flatten the output vector, and set the index as another input feature. The equation learned will be a single equation containing indices as a feature. -- [ ] Tree crossover? I.e., can take as input a part of the same equation, so long as it is the same level or below? -- [ ] Create flexible way of providing "simplification recipes." I.e., plus(plus(T, C), C) => plus(T, +(C, C)). The user could pass these. -- [ ] Consider allowing multi-threading turned off, for faster testing (cache issue on travis). Or could simply fix the caching issue there. -- [ ] Consider returning only the equation of interest; rather than all equations. -- [ ] Enable derivative operators. 
These would differentiate their right argument wrt their left argument, some input variable. - -## Algorithmic performance ideas: - - -- [ ] Use package compiler and compile sr.jl into a standalone binary that can be used by pysr. -- [ ] When doing equation warmup, only migrate those equations with almost the same complexity. Rather than having to consider simple equations later in the game. -- [ ] Right now we only update the score based on some. Need to update score based on entire data! Note that optimizer only is used sometimes. -- [ ] Idea: use gradient of equation with respect to each operator (perhaps simply add to each operator) to tell which part is the most "sensitive" to changes. Then, perhaps insert/delete/mutate on that part of the tree? -- [ ] Start populations staggered; so that there is more frequent printing (and pops that start a bit later get hall of fame already)? -- [ ] Consider adding mutation for constant<->variable -- [ ] Implement more parts of the original Eureqa algorithms: https://www.creativemachineslab.com/eureqa.html -- [ ] Experiment with freezing parts of model; then we only append/delete at end of tree. -- [ ] Use NN to generate weights over all probability distribution conditional on error and existing equation, and train on some randomly-generated equations -- [ ] For hierarchical idea: after running some number of iterations, do a search for "most common pattern". Then, turn that subtree into its own operator. -- [ ] Calculate feature importances based on features we've already seen, then weight those features up in all random generations. -- [ ] Calculate feature importances of future mutations, by looking at correlation between residual of model, and the features. - - Store feature importances of future, and periodically update it. -- [ ] Punish depth rather than size, as depth really hurts during optimization. - - -## Code performance ideas: - -- [ ] How hard is it to turn the recursive array evaluation into a for loop? 
-- [ ] Try defining a binary tree as an array, rather than a linked list. See https://stackoverflow.com/a/6384714/2689923 - - in array branch -- [ ] Add true multi-node processing, with MPI, or just file sharing. Multiple populations per core. - - Ongoing in cluster branch -- [ ] Performance: try inling things? -- [ ] Try storing things like number nodes in a tree; then can iterate instead of counting - -```julia -mutable struct Tree - degree::Array{Integer, 1} - val::Array{Float32, 1} - constant::Array{Bool, 1} - op::Array{Integer, 1} - Tree(s::Integer) = new(zeros(Integer, s), zeros(Float32, s), zeros(Bool, s), zeros(Integer, s)) -end -``` - -- Then, we could even work with trees on the GPU, since they are all pre-allocated arrays. -- A population could be a Tree, but with degree 2 on all the degrees. So a slice of population arrays forms a tree. -- How many operations can we do via matrix ops? Mutate node=>easy. -- Can probably batch and do many operations at once across a population. - - Or, across all populations! Mutate operator: index 2D array and set it to random vector? But the indexing might hurt. -- The big advantage: can evaluate all new mutated trees at once; as massive matrix operation. - - Can control depth, rather than maxsize. Then just pretend all trees are full and same depth. Then we really don't need to care about depth. - -- [ ] Can we cache calculations, or does the compiler do that? E.g., I should only have to run exp(x0) once; after that it should be read from memory. - - Done on caching branch. Currently am finding that this is quiet slow (presumably because memory allocation is the main issue). -- [ ] Add GPU capability? - - Not sure if possible, as binary trees are the real bottleneck. - - Could generate on CPU, evaluate score on GPU? 
diff --git a/datasets/FeynmanEquations.csv b/datasets/FeynmanEquations.csv deleted file mode 100644 index bd80cfba0b4bf59d6b99bb98fbc6445101f6ce48..0000000000000000000000000000000000000000 --- a/datasets/FeynmanEquations.csv +++ /dev/null @@ -1,101 +0,0 @@ -Filename,datapoints,Number,Output,Formula,# variables,v1_name,v1_low,v1_high,v2_name,v2_low,v2_high,v3_name,v3_low,v3_high,v4_name,v4_low,v4_high,v5_name,v5_low,v5_high,v6_name,v6_low,v6_high,v7_name,v7_low,v7_high,v8_name,v8_low,v8_high,v9_name,v9_low,v9_high,v10_name,v10_low,v10_high -I.6.2a,10,1,f,exp(-theta**2/2)/sqrt(2*pi),1,theta,1,3,,,,,,,,,,,,,,,,,,,,,,,,,,, -I.6.2,100,2,f,exp(-(theta/sigma)**2/2)/(sqrt(2*pi)*sigma),2,sigma,1,3,theta,1,3,,,,,,,,,,,,,,,,,,,,,,,, -I.6.2b,1000,3,f,exp(-((theta-theta1)/sigma)**2/2)/(sqrt(2*pi)*sigma),3,sigma,1,3,theta,1,3,theta1,1,3,,,,,,,,,,,,,,,,,,,,, -I.8.14,100,4,d,sqrt((x2-x1)**2+(y2-y1)**2),4,x1,1,5,x2,1,5,y1,1,5,y2,1,5,,,,,,,,,,,,,,,,,, -I.9.18,1000000,5,F,G*m1*m2/((x2-x1)**2+(y2-y1)**2+(z2-z1)**2),9,m1,1,2,m2,1,2,G,1,2,x1,3,4,x2,1,2,y1,3,4,y2,1,2,z1,3,4,z2,1,2,,, -I.10.7,10,6,m,m_0/sqrt(1-v**2/c**2),3,m_0,1,5,v,1,2,c,3,10,,,,,,,,,,,,,,,,,,,,, -I.11.19,100,7,A,x1*y1+x2*y2+x3*y3,6,x1,1,5,x2,1,5,x3,1,5,y1,1,5,y2,1,5,y3,1,5,,,,,,,,,,,, -I.12.1,10,8,F,mu*Nn,2,mu,1,5,Nn,1,5,,,,,,,,,,,,,,,,,,,,,,,, -I.12.2,10,10,F,q1*q2*r/(4*pi*epsilon*r**3),4,q1,1,5,q2,1,5,epsilon,1,5,r,1,5,,,,,,,,,,,,,,,,,, -I.12.4,10,11,Ef,q1*r/(4*pi*epsilon*r**3),3,q1,1,5,epsilon,1,5,r,1,5,,,,,,,,,,,,,,,,,,,,, -I.12.5,10,12,F,q2*Ef,2,q2,1,5,Ef,1,5,,,,,,,,,,,,,,,,,,,,,,,, -I.12.11,10,13,F,q*(Ef+B*v*sin(theta)),5,q,1,5,Ef,1,5,B,1,5,v,1,5,theta,1,5,,,,,,,,,,,,,,, -I.13.4,10,9,K,1/2*m*(v**2+u**2+w**2),4,m,1,5,v,1,5,u,1,5,w,1,5,,,,,,,,,,,,,,,,,, -I.13.12,10,14,U,G*m1*m2*(1/r2-1/r1),5,m1,1,5,m2,1,5,r1,1,5,r2,1,5,G,1,5,,,,,,,,,,,,,,, -I.14.3,10,15,U,m*g*z,3,m,1,5,g,1,5,z,1,5,,,,,,,,,,,,,,,,,,,,, -I.14.4,10,16,U,1/2*k_spring*x**2,2,k_spring,1,5,x,1,5,,,,,,,,,,,,,,,,,,,,,,,, 
-I.15.3x,10,17,x1,(x-u*t)/sqrt(1-u**2/c**2),4,x,5,10,u,1,2,c,3,20,t,1,2,,,,,,,,,,,,,,,,,, -I.15.3t,100,18,t1,(t-u*x/c**2)/sqrt(1-u**2/c**2),4,x,1,5,c,3,10,u,1,2,t,1,5,,,,,,,,,,,,,,,,,, -I.15.1,10,19,p,m_0*v/sqrt(1-v**2/c**2),3,m_0,1,5,v,1,2,c,3,10,,,,,,,,,,,,,,,,,,,,, -I.16.6,10,20,v1,(u+v)/(1+u*v/c**2),3,c,1,5,v,1,5,u,1,5,,,,,,,,,,,,,,,,,,,,, -I.18.4,10,21,r,(m1*r1+m2*r2)/(m1+m2),4,m1,1,5,m2,1,5,r1,1,5,r2,1,5,,,,,,,,,,,,,,,,,, -I.18.12,10,22,tau,r*F*sin(theta),3,r,1,5,F,1,5,theta,0,5,,,,,,,,,,,,,,,,,,,,, -I.18.14,10,23,L,m*r*v*sin(theta),4,m,1,5,r,1,5,v,1,5,theta,1,5,,,,,,,,,,,,,,,,,, -I.24.6,10,24,E_n,1/2*m*(omega**2+omega_0**2)*1/2*x**2,4,m,1,3,omega,1,3,omega_0,1,3,x,1,3,,,,,,,,,,,,,,,,,, -I.25.13,10,25,Volt,q/C,2,q,1,5,C,1,5,,,,,,,,,,,,,,,,,,,,,,,, -I.26.2,100,26,theta1,arcsin(n*sin(theta2)),2,n,0,1,theta2,1,5,,,,,,,,,,,,,,,,,,,,,,,, -I.27.6,10,27,foc,1/(1/d1+n/d2),3,d1,1,5,d2,1,5,n,1,5,,,,,,,,,,,,,,,,,,,,, -I.29.4,10,28,k,omega/c,2,omega,1,10,c,1,10,,,,,,,,,,,,,,,,,,,,,,,, -I.29.16,1000,29,x,sqrt(x1**2+x2**2-2*x1*x2*cos(theta1-theta2)),4,x1,1,5,x2,1,5,theta1,1,5,theta2,1,5,,,,,,,,,,,,,,,,,, -I.30.3,100,30,Int,Int_0*sin(n*theta/2)**2/sin(theta/2)**2,3,Int_0,1,5,theta,1,5,n,1,5,,,,,,,,,,,,,,,,,,,,, -I.30.5,100,31,theta,arcsin(lambd/(n*d)),3,lambd,1,2,d,2,5,n,1,5,,,,,,,,,,,,,,,,,,,,, -I.32.5,10,32,Pwr,q**2*a**2/(6*pi*epsilon*c**3),4,q,1,5,a,1,5,epsilon,1,5,c,1,5,,,,,,,,,,,,,,,,,, -I.32.17,10,33,Pwr,(1/2*epsilon*c*Ef**2)*(8*pi*r**2/3)*(omega**4/(omega**2-omega_0**2)**2),6,epsilon,1,2,c,1,2,Ef,1,2,r,1,2,omega,1,2,omega_0,3,5,,,,,,,,,,,, -I.34.8,10,34,omega,q*v*B/p,4,q,1,5,v,1,5,B,1,5,p,1,5,,,,,,,,,,,,,,,,,, -I.34.1,10,35,omega,omega_0/(1-v/c),3,c,3,10,v,1,2,omega_0,1,5,,,,,,,,,,,,,,,,,,,,, -I.34.14,10,36,omega,(1+v/c)/sqrt(1-v**2/c**2)*omega_0,3,c,3,10,v,1,2,omega_0,1,5,,,,,,,,,,,,,,,,,,,,, -I.34.27,10,37,E_n,(h/(2*pi))*omega,2,omega,1,5,h,1,5,,,,,,,,,,,,,,,,,,,,,,,, -I.37.4,100,38,Int,I1+I2+2*sqrt(I1*I2)*cos(delta),3,I1,1,5,I2,1,5,delta,1,5,,,,,,,,,,,,,,,,,,,,, 
-I.38.12,10,39,r,4*pi*epsilon*(h/(2*pi))**2/(m*q**2),4,m,1,5,q,1,5,h,1,5,epsilon,1,5,,,,,,,,,,,,,,,,,, -I.39.1,10,40,E_n,3/2*pr*V,2,pr,1,5,V,1,5,,,,,,,,,,,,,,,,,,,,,,,, -I.39.11,10,41,E_n,1/(gamma-1)*pr*V,3,gamma,2,5,pr,1,5,V,1,5,,,,,,,,,,,,,,,,,,,,, -I.39.22,10,42,pr,n*kb*T/V,4,n,1,5,T,1,5,V,1,5,kb,1,5,,,,,,,,,,,,,,,,,, -I.40.1,10,43,n,n_0*exp(-m*g*x/(kb*T)),6,n_0,1,5,m,1,5,x,1,5,T,1,5,g,1,5,kb,1,5,,,,,,,,,,,, -I.41.16,10,44,L_rad,h/(2*pi)*omega**3/(pi**2*c**2*(exp((h/(2*pi))*omega/(kb*T))-1)),5,omega,1,5,T,1,5,h,1,5,kb,1,5,c,1,5,,,,,,,,,,,,,,, -I.43.16,10,45,v,mu_drift*q*Volt/d,4,mu_drift,1,5,q,1,5,Volt,1,5,d,1,5,,,,,,,,,,,,,,,,,, -I.43.31,10,46,D,mob*kb*T,3,mob,1,5,T,1,5,kb,1,5,,,,,,,,,,,,,,,,,,,,, -I.43.43,10,47,kappa,1/(gamma-1)*kb*v/A,4,gamma,2,5,kb,1,5,A,1,5,v,1,5,,,,,,,,,,,,,,,,,, -I.44.4,10,48,E_n,n*kb*T*ln(V2/V1),5,n,1,5,kb,1,5,T,1,5,V1,1,5,V2,1,5,,,,,,,,,,,,,,, -I.47.23,10,49,c,sqrt(gamma*pr/rho),3,gamma,1,5,pr,1,5,rho,1,5,,,,,,,,,,,,,,,,,,,,, -I.48.2,100,50,E_n,m*c**2/sqrt(1-v**2/c**2),3,m,1,5,v,1,2,c,3,10,,,,,,,,,,,,,,,,,,,,, -I.50.26,10,51,x,x1*(cos(omega*t)+alpha*cos(omega*t)**2),4,x1,1,3,omega,1,3,t,1,3,alpha,1,3,,,,,,,,,,,,,,,,,, -II.2.42,10,52,Pwr,kappa*(T2-T1)*A/d,5,kappa,1,5,T1,1,5,T2,1,5,A,1,5,d,1,5,,,,,,,,,,,,,,, -II.3.24,10,53,flux,Pwr/(4*pi*r**2),2,Pwr,1,5,r,1,5,,,,,,,,,,,,,,,,,,,,,,,, -II.4.23,10,54,Volt,q/(4*pi*epsilon*r),3,q,1,5,epsilon,1,5,r,1,5,,,,,,,,,,,,,,,,,,,,, -II.6.11,10,55,Volt,1/(4*pi*epsilon)*p_d*cos(theta)/r**2,4,epsilon,1,3,p_d,1,3,theta,1,3,r,1,3,,,,,,,,,,,,,,,,,, -II.6.15a,1000,56,Ef,p_d/(4*pi*epsilon)*3*z/r**5*sqrt(x**2+y**2),6,epsilon,1,3,p_d,1,3,r,1,3,x,1,3,y,1,3,z,1,3,,,,,,,,,,,, -II.6.15b,10,57,Ef,p_d/(4*pi*epsilon)*3*cos(theta)*sin(theta)/r**3,4,epsilon,1,3,p_d,1,3,theta,1,3,r,1,3,,,,,,,,,,,,,,,,,, -II.8.7,10,58,E_n,3/5*q**2/(4*pi*epsilon*d),3,q,1,5,epsilon,1,5,d,1,5,,,,,,,,,,,,,,,,,,,,, -II.8.31,10,59,E_den,epsilon*Ef**2/2,2,epsilon,1,5,Ef,1,5,,,,,,,,,,,,,,,,,,,,,,,, 
-II.10.9,10,60,Ef,sigma_den/epsilon*1/(1+chi),3,sigma_den,1,5,epsilon,1,5,chi,1,5,,,,,,,,,,,,,,,,,,,,, -II.11.3,10,61,x,q*Ef/(m*(omega_0**2-omega**2)),5,q,1,3,Ef,1,3,m,1,3,omega_0,3,5,omega,1,2,,,,,,,,,,,,,,, -II.11.17,10,62,n,n_0*(1+p_d*Ef*cos(theta)/(kb*T)),6,n_0,1,3,kb,1,3,T,1,3,theta,1,3,p_d,1,3,Ef,1,3,,,,,,,,,,,, -II.11.20,10,63,Pol,n_rho*p_d**2*Ef/(3*kb*T),5,n_rho,1,5,p_d,1,5,Ef,1,5,kb,1,5,T,1,5,,,,,,,,,,,,,,, -II.11.27,100,64,Pol,n*alpha/(1-(n*alpha/3))*epsilon*Ef,4,n,0,1,alpha,0,1,epsilon,1,2,Ef,1,2,,,,,,,,,,,,,,,,,, -II.11.28,100,65,theta,1+n*alpha/(1-(n*alpha/3)),2,n,0,1,alpha,0,1,,,,,,,,,,,,,,,,,,,,,,,, -II.13.17,10,66,B,1/(4*pi*epsilon*c**2)*2*I/r,4,epsilon,1,5,c,1,5,I,1,5,r,1,5,,,,,,,,,,,,,,,,,, -II.13.23,100,67,rho_c,rho_c_0/sqrt(1-v**2/c**2),3,rho_c_0,1,5,v,1,2,c,3,10,,,,,,,,,,,,,,,,,,,,, -II.13.34,10,68,j,rho_c_0*v/sqrt(1-v**2/c**2),3,rho_c_0,1,5,v,1,2,c,3,10,,,,,,,,,,,,,,,,,,,,, -II.15.4,10,69,E_n,-mom*B*cos(theta),3,mom,1,5,B,1,5,theta,1,5,,,,,,,,,,,,,,,,,,,,, -II.15.5,10,70,E_n,-p_d*Ef*cos(theta),3,p_d,1,5,Ef,1,5,theta,1,5,,,,,,,,,,,,,,,,,,,,, -II.21.32,10,71,Volt,q/(4*pi*epsilon*r*(1-v/c)),5,q,1,5,epsilon,1,5,r,1,5,v,1,2,c,3,10,,,,,,,,,,,,,,, -II.24.17,10,72,k,sqrt(omega**2/c**2-pi**2/d**2),3,omega,4,6,c,1,2,d,2,4,,,,,,,,,,,,,,,,,,,,, -II.27.16,10,73,flux,epsilon*c*Ef**2,3,epsilon,1,5,c,1,5,Ef,1,5,,,,,,,,,,,,,,,,,,,,, -II.27.18,10,74,E_den,epsilon*Ef**2,2,epsilon,1,5,Ef,1,5,,,,,,,,,,,,,,,,,,,,,,,, -II.34.2a,10,75,I,q*v/(2*pi*r),3,q,1,5,v,1,5,r,1,5,,,,,,,,,,,,,,,,,,,,, -II.34.2,10,76,mom,q*v*r/2,3,q,1,5,v,1,5,r,1,5,,,,,,,,,,,,,,,,,,,,, -II.34.11,10,77,omega,g_*q*B/(2*m),4,g_,1,5,q,1,5,B,1,5,m,1,5,,,,,,,,,,,,,,,,,, -II.34.29a,10,78,mom,q*h/(4*pi*m),3,q,1,5,h,1,5,m,1,5,,,,,,,,,,,,,,,,,,,,, -II.34.29b,10,79,E_n,g_*mom*B*Jz/(h/(2*pi)),5,g_,1,5,h,1,5,Jz,1,5,mom,1,5,B,1,5,,,,,,,,,,,,,,, -II.35.18,10,80,n,n_0/(exp(mom*B/(kb*T))+exp(-mom*B/(kb*T))),5,n_0,1,3,kb,1,3,T,1,3,mom,1,3,B,1,3,,,,,,,,,,,,,,, 
-II.35.21,10,81,M,n_rho*mom*tanh(mom*B/(kb*T)),5,n_rho,1,5,mom,1,5,B,1,5,kb,1,5,T,1,5,,,,,,,,,,,,,,, -II.36.38,10,82,f,mom*H/(kb*T)+(mom*alpha)/(epsilon*c**2*kb*T)*M,8,mom,1,3,H,1,3,kb,1,3,T,1,3,alpha,1,3,epsilon,1,3,c,1,3,M,1,3,,,,,, -II.37.1,10,83,E_n,mom*(1+chi)*B,3,mom,1,5,B,1,5,chi,1,5,,,,,,,,,,,,,,,,,,,,, -II.38.3,10,84,F,Y*A*x/d,4,Y,1,5,A,1,5,d,1,5,x,1,5,,,,,,,,,,,,,,,,,, -II.38.14,10,85,mu_S,Y/(2*(1+sigma)),2,Y,1,5,sigma,1,5,,,,,,,,,,,,,,,,,,,,,,,, -III.4.32,10,86,n,1/(exp((h/(2*pi))*omega/(kb*T))-1),4,h,1,5,omega,1,5,kb,1,5,T,1,5,,,,,,,,,,,,,,,,,, -III.4.33,10,87,E_n,(h/(2*pi))*omega/(exp((h/(2*pi))*omega/(kb*T))-1),4,h,1,5,omega,1,5,kb,1,5,T,1,5,,,,,,,,,,,,,,,,,, -III.7.38,10,88,omega,2*mom*B/(h/(2*pi)),3,mom,1,5,B,1,5,h,1,5,,,,,,,,,,,,,,,,,,,,, -III.8.54,10,89,prob,sin(E_n*t/(h/(2*pi)))**2,3,E_n,1,2,t,1,2,h,1,4,,,,,,,,,,,,,,,,,,,,, -III.9.52,1000,90,prob,(p_d*Ef*t/(h/(2*pi)))*sin((omega-omega_0)*t/2)**2/((omega-omega_0)*t/2)**2,6,p_d,1,3,Ef,1,3,t,1,3,h,1,3,omega,1,5,omega_0,1,5,,,,,,,,,,,, -III.10.19,100,91,E_n,mom*sqrt(Bx**2+By**2+Bz**2),4,mom,1,5,Bx,1,5,By,1,5,Bz,1,5,,,,,,,,,,,,,,,,,, -III.12.43,10,92,L,n*(h/(2*pi)),2,n,1,5,h,1,5,,,,,,,,,,,,,,,,,,,,,,,, -III.13.18,10,93,v,2*E_n*d**2*k/(h/(2*pi)),4,E_n,1,5,d,1,5,k,1,5,h,1,5,,,,,,,,,,,,,,,,,, -III.14.14,10,94,I,I_0*(exp(q*Volt/(kb*T))-1),5,I_0,1,5,q,1,2,Volt,1,2,kb,1,2,T,1,2,,,,,,,,,,,,,,, -III.15.12,10,95,E_n,2*U*(1-cos(k*d)),3,U,1,5,k,1,5,d,1,5,,,,,,,,,,,,,,,,,,,,, -III.15.14,10,96,m,(h/(2*pi))**2/(2*E_n*d**2),3,h,1,5,E_n,1,5,d,1,5,,,,,,,,,,,,,,,,,,,,, -III.15.27,10,97,k,2*pi*alpha/(n*d),3,alpha,1,5,n,1,5,d,1,5,,,,,,,,,,,,,,,,,,,,, -III.17.37,10,98,f,beta*(1+alpha*cos(theta)),3,beta,1,5,alpha,1,5,theta,1,5,,,,,,,,,,,,,,,,,,,,, -III.19.51,10,99,E_n,-m*q**4/(2*(4*pi*epsilon)**2*(h/(2*pi))**2)*(1/n**2),5,m,1,5,q,1,5,h,1,5,n,1,5,epsilon,1,5,,,,,,,,,,,,,,, -III.21.20,10,100,j,-rho_c_0*q*A_vec/m,4,rho_c_0,1,5,q,1,5,A_vec,1,5,m,1,5,,,,,,,,,,,,,,,,,, diff --git a/docs/backend.md b/docs/backend.md index 
0c7afdbf80285e4f57ad6f2dc2d0fc36a8ea2a70..b7575d143cf13ba82753078a2709e8b19c103866 100644 --- a/docs/backend.md +++ b/docs/backend.md @@ -2,27 +2,73 @@ If you have explored the [options](options.md) and [PySRRegressor reference](api.md), and still haven't figured out how to specify a constraint or objective required for your problem, you might consider editing the backend. The backend of PySR is written as a pure Julia package under the name [SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl). -This package is accessed with [`PyJulia`](https://github.com/JuliaPy/pyjulia), which allows us to transfer objects back and forth between the Python and Julia runtimes. +This package is accessed with [`juliacall`](https://github.com/JuliaPy/PythonCall.jl), which allows us to transfer objects back and forth between the Python and Julia runtimes. PySR gives you access to everything in SymbolicRegression.jl, but there are some specific use-cases which require modifications to the backend itself. Generally you can do this as follows: -1. Clone a copy of the backend: -``` +## 1. Check out the source code + +Clone a copy of the backend as well as PySR: + +```bash git clone https://github.com/MilesCranmer/SymbolicRegression.jl +git clone https://github.com/MilesCranmer/PySR +``` + +You may wish to check out the specific versions, which you can do with: + +```bash +cd PySR +git checkout + +# You can see the current backend version in `pysr/juliapkg.json` +cd ../SymbolicRegression.jl +git checkout +``` + +## 2. Edit the source to your requirements + +The main search code can be found in `src/SymbolicRegression.jl`. + +Here are some tips: + +- The documentation for the backend is given [here](https://astroautomata.com/SymbolicRegression.jl/dev/). +- Throughout the package, you will often see template functions which typically use a symbol `T` (such as in the string `where {T<:Real}`). 
Here, `T` is simply the datatype of the input data and stored constants, such as `Float32` or `Float64`. Writing functions in this way lets us write functions generic to types, while still having access to the specific type specified at compilation time. +- Expressions are stored as binary trees, using the `Node{T}` type, described [here](https://astroautomata.com/SymbolicRegression.jl/dev/types/#SymbolicRegression.CoreModule.EquationModule.Node). +- For reference, the main loop itself is found in the `equation_search` function inside [`src/SymbolicRegression.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl). +- Parts of the code which are typically edited by users include: + - [`src/CheckConstraints.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/CheckConstraints.jl), particularly the function `check_constraints`. This function checks whether a given expression satisfies constraints, such as having a complexity lower than `maxsize`, and whether it contains any forbidden nestings of functions. + - Note that all expressions, *even intermediate expressions*, must comply with constraints. Therefore, make sure that evolution can still reach your desired expression (with one mutation at a time), before setting a hard constraint. In other cases you might want to instead put in the loss function. + - [`src/Options.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/Options.jl), as well as the struct definition in [`src/OptionsStruct.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl). This file specifies all the options used in the search: an instance of `Options` is typically available throughout every function in `SymbolicRegression.jl`. If you add new functionality to the backend, and wish to make it parameterizable (including from PySR), you should specify it in the options. + +## 3. 
Let PySR use the modified backend + +Once you have made your changes, you should edit the `pysr/juliapkg.json` file +in the PySR repository to point to this local copy. +Do this by removing the `"version"` key and adding a `"dev"` and `"path"` key: + +```json + ... + "packages": { + "SymbolicRegression": { + "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb", + "dev": true, + "path": "/path/to/SymbolicRegression.jl" + }, + ... ``` -2. Edit the source code in `src/` to your requirements: - - The documentation for the backend is given [here](https://astroautomata.com/SymbolicRegression.jl/dev/). - - Throughout the package, you will often see template functions which typically use a symbol `T` (such as in the string `where {T<:Real}`). Here, `T` is simply the datatype of the input data and stored constants, such as `Float32` or `Float64`. Writing functions in this way lets us write functions generic to types, while still having access to the specific type specified at compilation time. - - Expressions are stored as binary trees, using the `Node{T}` type, described [here](https://astroautomata.com/SymbolicRegression.jl/dev/types/#SymbolicRegression.CoreModule.EquationModule.Node). - - Parts of the code which are typically edited by users include: - - [`src/LossFunctions.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/LossFunctions.jl), particularly the function `eval_loss`. This function assigns a loss to a given expression, using `eval_tree_array` to evaluate it, and `loss` to compute the loss with respect to the dataset. - - [`src/CheckConstraints.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/CheckConstraints.jl), particularly the function `check_constraints`. This function checks whether a given expression satisfies constraints, such as having a complexity lower than `maxsize`, and whether it contains any forbidden nestings of functions. 
- - Note that all expressions, *even intermediate expressions*, must comply with constraints. Therefore, make sure that evolution can still reach your desired expression (with one mutation at a time), before setting a hard constraint. In other cases you might want to instead put in the loss function. - - [`src/Options.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/Options.jl), as well as the struct definition in [`src/OptionsStruct.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl). This file specifies all the options used in the search: an instance of `Options` is typically available throughout every function in `SymbolicRegression.jl`. If you add new functionality to the backend, and wish to make it parameterizable (including from PySR), you should specify it in the options. - - For reference, the main loop itself is found in the `equation_search` function inside [`src/SymbolicRegression.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl). -3. Specify the directory of `SymbolicRegression.jl` to PySR by setting `julia_project` in the `PySRRegressor` object, and run `.fit` when you're ready. That's it! No compilation or build steps required. - - Note that it will automatically update your project by default; to turn this off, set `update=False`. + +You can then install PySR with this modified backend by running: + +```bash +cd PySR +pip install . +``` + +For more information on `juliapkg.json`, see [`pyjuliapkg`](https://github.com/JuliaPy/pyjuliapkg). + +## Additional notes If you get comfortable enough with the backend, you might consider using the Julia package directly: the API is given on the [SymbolicRegression.jl documentation](https://astroautomata.com/SymbolicRegression.jl/dev/). 
diff --git a/docs/examples.md b/docs/examples.md index 1b3f7e5250532abe78bd4b821432cd21d8f93199..69fb98f8df33698b2606707a87fee32de48294bb 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -144,7 +144,7 @@ but there are still some additional steps you can take to reduce the effect of n One thing you could do, which we won't detail here, is to create a custom log-likelihood given some assumed noise model. By passing weights to the fit function, and -defining a custom loss function such as `loss="myloss(x, y, w) = w * (x - y)^2"`, +defining a custom loss function such as `elementwise_loss="myloss(x, y, w) = w * (x - y)^2"`, you can define any sort of log-likelihood you wish. (However, note that it must be bounded at zero) However, the simplest thing to do is preprocessing, just like for feature selection. To do this, @@ -189,12 +189,10 @@ where $p_i$ is the $i$th prime number, and $x$ is the input feature. Let's see if we can discover this using the [Primes.jl](https://github.com/JuliaMath/Primes.jl) package. -First, let's manually initialize the Julia backend -(here, with 8 threads and `-O3`): +First, let's get the Julia backend: ```python -import pysr -jl = pysr.julia_helpers.init_julia(julia_kwargs={"threads": 8, "optimize": 3}) +from pysr import jl ``` `jl` stores the Julia runtime. @@ -203,7 +201,7 @@ Now, let's run some Julia code to add the Primes.jl package to the PySR environment: ```python -jl.eval(""" +jl.seval(""" import Pkg Pkg.add("Primes") """) @@ -213,13 +211,13 @@ This imports the Julia package manager, and uses it to install `Primes.jl`. Now let's import `Primes.jl`: ```python -jl.eval("import Primes") +jl.seval("import Primes") ``` Now, we define a custom operator: ```python -jl.eval(""" +jl.seval(""" function p(i::T) where T if (0.5 < i < 1000) return T(Primes.prime(round(Int, i))) @@ -237,7 +235,7 @@ If in-bounds, it rounds it to the nearest integer, compures the corresponding pr converts it to the same type as input. 
Next, let's generate a list of primes for our test dataset. -Since we are using PyJulia, we can just call `p` directly to do this: +Since we are using juliacall, we can just call `p` directly to do this: ```python primes = {i: jl.p(i*1.0) for i in range(1, 999)} @@ -382,7 +380,7 @@ end model = PySRRegressor( niterations=100, binary_operators=["*", "+", "-"], - full_objective=objective, + loss_function=objective, ) ``` @@ -464,7 +462,7 @@ let's also create a custom loss function that looks at the error in log-space: ```python -loss = """function loss_fnc(prediction, target) +elementwise_loss = """function loss_fnc(prediction, target) scatter_loss = abs(log((abs(prediction)+1e-20) / (abs(target)+1e-20))) sign_loss = 10 * (sign(prediction) - sign(target))^2 return scatter_loss + sign_loss @@ -478,7 +476,7 @@ Now let's define our model: model = PySRRegressor( binary_operators=["+", "-", "*", "/"], unary_operators=["square"], - loss=loss, + elementwise_loss=elementwise_loss, complexity_of_constants=2, maxsize=25, niterations=100, diff --git a/docs/options.md b/docs/options.md index 5eee94d8e7c55566c7a7edab2f44c3ea2086c2d8..0ccacbcabb8368d9fea78389f79f3567df3e786d 100644 --- a/docs/options.md +++ b/docs/options.md @@ -78,11 +78,11 @@ with the equations. Each cycle considers every 10-equation subsample (re-sampled for each individual 10, unless `fast_cycle` is set in which case the subsamples are separate groups of equations) a single time, producing one mutated equation for each. -The parameter `ncyclesperiteration` defines how many times this +The parameter `ncycles_per_iteration` defines how many times this occurs before the equations are compared to the hall of fame, and new equations are migrated from the hall of fame, or from other populations. It also controls how slowly annealing occurs. 
You may find that increasing -`ncyclesperiteration` results in a higher cycles-per-second, as the head +`ncycles_per_iteration` results in a higher cycles-per-second, as the head worker needs to reduce and distribute new equations less often, and also increases diversity. But at the same time, a smaller number it might be that migrating equations from the hall of fame helps @@ -243,7 +243,7 @@ train the parameters within JAX (and is differentiable). The default loss is mean-square error, and weighted mean-square error. One can pass an arbitrary Julia string to define a custom loss, using, -e.g., `loss="myloss(x, y) = abs(x - y)^1.5"`. For more details, +e.g., `elementwise_loss="myloss(x, y) = abs(x - y)^1.5"`. For more details, see the [Losses](https://milescranmer.github.io/SymbolicRegression.jl/dev/losses/) page for SymbolicRegression.jl. @@ -253,26 +253,26 @@ Here are some additional examples: abs(x-y) loss ```python -PySRRegressor(..., loss="f(x, y) = abs(x - y)^1.5") +PySRRegressor(..., elementwise_loss="f(x, y) = abs(x - y)^1.5") ``` Note that the function name doesn't matter: ```python -PySRRegressor(..., loss="loss(x, y) = abs(x * y)") +PySRRegressor(..., elementwise_loss="loss(x, y) = abs(x * y)") ``` With weights: ```python -model = PySRRegressor(..., loss="myloss(x, y, w) = w * abs(x - y)") +model = PySRRegressor(..., elementwise_loss="myloss(x, y, w) = w * abs(x - y)") model.fit(..., weights=weights) ``` Weights can be used in arbitrary ways: ```python -model = PySRRegressor(..., weights=weights, loss="myloss(x, y, w) = abs(x - y)^2/w^2") +model = PySRRegressor(..., weights=weights, elementwise_loss="myloss(x, y, w) = abs(x - y)^2/w^2") model.fit(..., weights=weights) ``` @@ -280,13 +280,13 @@ Built-in loss (faster) (see [losses](https://astroautomata.com/SymbolicRegressio This one computes the L3 norm: ```python -PySRRegressor(..., loss="LPDistLoss{3}()") +PySRRegressor(..., elementwise_loss="LPDistLoss{3}()") ``` Can also uses these losses for 
weighted (weighted-average): ```python -model = PySRRegressor(..., weights=weights, loss="LPDistLoss{3}()") +model = PySRRegressor(..., weights=weights, elementwise_loss="LPDistLoss{3}()") model.fit(..., weights=weights) ``` diff --git a/docs/tuning.md b/docs/tuning.md index aac4d1260695cca40d4c64022383260ba2b6dd22..29775ee4023ce917a925f1776361ce36484372f8 100644 --- a/docs/tuning.md +++ b/docs/tuning.md @@ -14,12 +14,12 @@ I run from IPython (Jupyter Notebooks don't work as well[^1]) on the head node o 2. Use only the operators I think it needs and no more. 3. Increase `populations` to `3*num_cores`. 4. If my dataset is more than 1000 points, I either subsample it (low-dimensional and not much noise) or set `batching=True` (high-dimensional or very noisy, so it needs to evaluate on all the data). -5. While on a laptop or single node machine, you might leave the default `ncyclesperiteration`, on a cluster with ~100 cores I like to set `ncyclesperiteration` to maybe `5000` or so, until the head node occupation is under `10%`. (A larger value means the workers talk less frequently to eachother, which is useful when you have many workers!) +5. While on a laptop or single node machine, you might leave the default `ncycles_per_iteration`, on a cluster with ~100 cores I like to set `ncycles_per_iteration` to maybe `5000` or so, until the head node occupation is under `10%`. (A larger value means the workers talk less frequently to each other, which is useful when you have many workers!) 6. Set `constraints` and `nested_constraints` as strict as possible. These can help quite a bit with exploration. Typically, if I am using `pow`, I would set `constraints={"pow": (9, 1)}`, so that power laws can only have a variable or constant as their exponent. If I am using `sin` and `cos`, I also like to set `nested_constraints={"sin": {"sin": 0, "cos": 0}, "cos": {"sin": 0, "cos": 0}}`, so that sin and cos can't be nested, which seems to happen frequently. 
(Although in practice I would just use `sin`, since the search could always add a phase offset!) 7. Set `maxsize` a bit larger than the final size you want. e.g., if you want a final equation of size `30`, you might set this to `35`, so that it has a bit of room to explore. 8. I typically don't use `maxdepth`, but if I do, I set it strictly, while also leaving a bit of room for exploration. e.g., if you want a final equation limited to a depth of `5`, you might set this to `6` or `7`, so that it has a bit of room to explore. 9. Set `parsimony` equal to about the minimum loss you would expect, divided by 5-10. e.g., if you expect the final equation to have a loss of `0.001`, you might set `parsimony=0.0001`. -10. Set `weight_optimize` to some larger value, maybe `0.001`. This is very important if `ncyclesperiteration` is large, so that optimization happens more frequently. +10. Set `weight_optimize` to some larger value, maybe `0.001`. This is very important if `ncycles_per_iteration` is large, so that optimization happens more frequently. 11. Set `turbo` to `True`. This may or not work, if there's an error just turn it off (some operators are not SIMD-capable). If it does work, it should give you a nice 20% speedup. 12. For final runs, after I have tuned everything, I typically set `niterations` to some very large value, and just let it run for a week until my job finishes (genetic algorithms tend not to converge, they can look like they settle down, but then find a new family of expression, and explore a new space). If I am satisfied with the current equations (which are visible either in the terminal or in the saved csv file), I quit the job early. 
diff --git a/environment.yml b/environment.yml index 9eea93761baad4a8bd448dca2e06c7ddf1b2c653..610662e4d93440606895cf6d477e71d815697a05 100644 --- a/environment.yml +++ b/environment.yml @@ -2,12 +2,11 @@ name: test channels: - conda-forge dependencies: - - sympy - - pandas - - numpy - - scikit-learn - - setuptools - - pyjulia - - openlibm - - openspecfun - - click + - python>=3.7 + - sympy>=1.0.0,<2.0.0 + - pandas>=0.21.0,<3.0.0 + - numpy>=1.13.0,<2.0.0 + - scikit-learn>=1.0.0,<2.0.0 + - pyjuliacall>=0.9.15,<0.10.0 + - click>=7.0.0,<9.0.0 + - typing_extensions>=4.0.0,<5.0.0 diff --git a/example.py b/example.py index e3b7deeecb1530f38f9e88b67ef80e33d1216b9b..c39cab9cc4bc8a0ec7ff2f274c68c5d2b43c8922 100644 --- a/example.py +++ b/example.py @@ -18,7 +18,7 @@ model = PySRRegressor( ], extra_sympy_mappings={"inv": lambda x: 1 / x}, # ^ Define operator for SymPy as well - loss="loss(x, y) = (x - y)^2", + elementwise_loss="loss(x, y) = (x - y)^2", # ^ Custom loss function (julia syntax) ) diff --git a/examples/pysr_demo.ipynb b/examples/pysr_demo.ipynb index 0b1e5abce84289b532bfaa5b605e926dd599f952..53606cba905869868ab6262488a4eb530540c246 100644 --- a/examples/pysr_demo.ipynb +++ b/examples/pysr_demo.ipynb @@ -15,68 +15,9 @@ "id": "tQ1r1bbb0yBv" }, "source": [ - "\n", "## Instructions\n", "1. Work on a copy of this notebook: _File_ > _Save a copy in Drive_ (you will need a Google account).\n", - "2. (Optional) If you would like to do the deep learning component of this tutorial, turn on the GPU with Edit->Notebook settings->Hardware accelerator->GPU\n", - "3. Execute the following cell (click on it and press Ctrl+Enter) to install Julia. This may take a minute or so.\n", - "4. 
Continue to the next section.\n", - "\n", - "_Notes_:\n", - "* If your Colab Runtime gets reset (e.g., due to inactivity), repeat steps 3, 4.\n", - "* After installation, if you want to change the Julia version or activate/deactivate the GPU, you will need to reset the Runtime: _Runtime_ > _Delete and disconnect runtime_ and repeat steps 2-4." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "COndi88gbDgO" - }, - "source": [ - "**Run the following code to install Julia**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GIeFXS0F0zww", - "outputId": "5399ed75-f77f-47c5-e53b-4b2f231f2839" - }, - "outputs": [], - "source": [ - "!curl -fsSL https://install.julialang.org | sh -s -- -y --default-channel 1.10" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Iu9X-Y-YNmwM", - "outputId": "ee14af65-043a-4ad6-efa0-3cdcc48a4eb8" - }, - "outputs": [], - "source": [ - "# Make julia available on PATH:\n", - "!ln -s $HOME/.juliaup/bin/julia /usr/local/bin/julia\n", - "\n", - "# Test it works:\n", - "!julia --version" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ORv1c6xvbDgV" - }, - "source": [ - "Install PySR" + "2. (Optional) If you would like to do the deep learning component of this tutorial, turn on the GPU with Edit->Notebook settings->Hardware accelerator->GPU\n" ] }, { @@ -91,36 +32,23 @@ }, "outputs": [], "source": [ - "!pip install pysr && python -m pysr install" + "!pip install -U pysr" ] }, { "cell_type": "markdown", - "metadata": { - "id": "etTMEV0wDqld" - }, + "metadata": {}, "source": [ - "Colab's printing is non-standard, so we need to manually initialize Julia and redirect its printing. 
Normally, however, this is not required, and PySR will automatically start Julia during the first call to `.fit`:" + "Julia and Julia dependencies are installed at first import:" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "j666aOI8xWF_" - }, + "metadata": {}, "outputs": [], "source": [ - "def init_colab_printing():\n", - " from pysr.julia_helpers import init_julia\n", - " from julia.tools import redirect_output_streams\n", - "\n", - " julia_kwargs = dict(optimize=3, threads=\"auto\", compiled_modules=False)\n", - " init_julia(julia_kwargs=julia_kwargs)\n", - " redirect_output_streams()\n", - "\n", - "\n", - "init_colab_printing()" + "import pysr" ] }, { @@ -129,7 +57,7 @@ "id": "qeCPKd9wldEK" }, "source": [ - "Now, let's import all of our libraries:" + "Now, let's import everything else as well as the PySRRegressor:\n" ] }, { @@ -233,7 +161,7 @@ " niterations=30,\n", " binary_operators=[\"+\", \"*\"],\n", " unary_operators=[\"cos\", \"exp\", \"sin\"],\n", - " **default_pysr_params\n", + " **default_pysr_params,\n", ")\n", "\n", "model.fit(X, y)" @@ -648,7 +576,7 @@ "outputs": [], "source": [ "model = PySRRegressor(\n", - " loss=\"myloss(x, y, w) = w * abs(x - y)\", # Custom loss function with weights.\n", + " elementwise_loss=\"myloss(x, y, w) = w * abs(x - y)\", # Custom loss function with weights.\n", " niterations=20,\n", " populations=20, # Use more populations\n", " binary_operators=[\"+\", \"*\"],\n", @@ -815,26 +743,7 @@ "where $p_i$ is the $i$th prime number, and $x$ is the input feature.\n", "\n", "Let's see if we can discover this using\n", - "the [Primes.jl](https://github.com/JuliaMath/Primes.jl) package.\n", - "\n", - "First, let's get the Julia backend\n", - "Here, we might choose to manually specify unlimited threads, `-O3`,\n", - "and `compile_modules=False`, although this will only propagate if Julia has not yet started:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"yUC4BMuHG-KN" - }, - "outputs": [], - "source": [ - "import pysr\n", - "\n", - "jl = pysr.julia_helpers.init_julia(\n", - " julia_kwargs=dict(optimize=3, threads=\"auto\", compiled_modules=False)\n", - ")" + "the [Primes.jl](https://github.com/JuliaMath/Primes.jl) package." ] }, { @@ -859,7 +768,9 @@ }, "outputs": [], "source": [ - "jl.eval(\n", + "from pysr import jl\n", + "\n", + "jl.seval(\n", " \"\"\"\n", "import Pkg\n", "Pkg.add(\"Primes\")\n", @@ -885,7 +796,24 @@ }, "outputs": [], "source": [ - "jl.eval(\"import Primes\")" + "jl.seval(\"using Primes: prime\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that PySR should automatically load the `juliacall.ipython` extension for you,\n", + "which means that you can also execute Julia code in the notebook using the `%%julia` magic:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%julia using Primes: prime" ] }, { @@ -906,11 +834,11 @@ }, "outputs": [], "source": [ - "jl.eval(\n", + "jl.seval(\n", " \"\"\"\n", "function p(i::T) where T\n", " if 0.5 < i < 1000\n", - " return T(Primes.prime(round(Int, i)))\n", + " return T(prime(round(Int, i)))\n", " else\n", " return T(NaN)\n", " end\n", @@ -919,6 +847,29 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or, equivalently:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%julia\n", + "function p(i::T) where T\n", + " if 0.5 < i < 1000\n", + " return T(prime(round(Int, i)))\n", + " else\n", + " return T(NaN)\n", + " end\n", + "end" + ] + }, { "cell_type": "markdown", "metadata": { @@ -947,7 +898,7 @@ "(However, note that this version assumes 64-bit float input, rather than any input type `T`)\n", "\n", "Next, let's generate a list of primes for our test dataset.\n", - "Since we are using PyJulia, we can just call `p` directly to do this:\n" + "Since we are using 
juliacall, we can just call `p` directly to do this:\n" ] }, { @@ -1382,7 +1333,7 @@ "\n", "> **Warning**\n", ">\n", - "> First, let's save the data, because sometimes PyTorch and PyJulia's C bindings interfere and cause the colab kernel to crash. If we need to restart, we can just load the data without having to retrain the network:" + "> First, let's save the data, because sometimes PyTorch and juliacall's C bindings interfere and cause the colab kernel to crash. If we need to restart, we can just load the data without having to retrain the network:" ] }, { @@ -1413,7 +1364,7 @@ "id": "krhaNlwFG-KT" }, "source": [ - "We can now load the data, including after a crash (be sure to re-run the import cells at the top of this notebook, including the one that starts PyJulia)." + "We can now load the data, including after a crash (be sure to re-run the import cells at the top of this notebook, including the one that starts juliacall)." ] }, { @@ -1467,7 +1418,7 @@ "id": "1a738a33" }, "source": [ - "If this segfaults, restart the notebook, and run the initial imports and PyJulia part, but skip the PyTorch training. This is because PyTorch's C binding tends to interefere with PyJulia. You can then re-run the `pkl.load` cell to import the data." + "If this segfaults, restart the notebook, and run the initial imports and juliacall part, but skip the PyTorch training. This is because PyTorch's C binding tends to interfere with juliacall. You can then re-run the `pkl.load` cell to import the data." 
] }, { diff --git a/pyproject.toml b/pyproject.toml index 5d7bf33d948c45d40befd4769d0c15f74615472b..f30480e0d86dc7c3432cb3c5afadf584ed11f465 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,31 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "pysr" +version = "0.17.0" +authors = [ + {name = "Miles Cranmer", email = "miles.cranmer@gmail.com"}, +] +description = "Simple and efficient symbolic regression" +readme = {file = "README.md", content-type = "text/markdown"} +license = {file = "LICENSE"} +requires-python = ">=3.7" +classifiers = [ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + "License :: OSI Approved :: Apache Software License" +] +dynamic = ["dependencies"] + +[tool.setuptools] +packages = ["pysr", "pysr._cli", "pysr.test"] +include-package-data = false +package-data = {pysr = ["juliapkg.json"]} + +[tool.setuptools.dynamic] +dependencies = {file = "requirements.txt"} + [tool.isort] profile = "black" diff --git a/pysr/.gitignore b/pysr/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..98527864664d32f798edc06a53131e8d5a068295 --- /dev/null +++ b/pysr/.gitignore @@ -0,0 +1 @@ +version.py diff --git a/pysr/__init__.py b/pysr/__init__.py index 1f2e9775df6c61fca62ce2c90f075d20d2850e95..e71e19905d8a12e926a9478e0185457fad2e3366 100644 --- a/pysr/__init__.py +++ b/pysr/__init__.py @@ -1,26 +1,23 @@ -import sys -import warnings - -if sys.version_info >= (3, 12, 0): - warnings.warn( - "PySR experiences occassional segfaults with Python 3.12. " - + "Please use an earlier version of Python with PySR until this issue is resolved." - ) +# This must be imported as early as possible to prevent +# library linking issues caused by numpy/pytorch/etc. importing +# old libraries: +from .julia_import import jl, SymbolicRegression # isort:skip from . 
import sklearn_monkeypatch -from .deprecated import best, best_callable, best_row, best_tex, pysr +from .deprecated import best, best_callable, best_row, best_tex, install, pysr from .export_jax import sympy2jax from .export_torch import sympy2torch -from .feynman_problems import FeynmanProblem, Problem -from .julia_helpers import install from .sr import PySRRegressor + +# This file is created by setuptools_scm during the build process: from .version import __version__ __all__ = [ + "jl", + "SymbolicRegression", "sklearn_monkeypatch", "sympy2jax", "sympy2torch", - "FeynmanProblem", "Problem", "install", "PySRRegressor", diff --git a/pysr/__main__.py b/pysr/__main__.py index e8dbdf901ccfadebea17852024babdcecd0d96df..e196f3c4243118f128bcdf75aac87883a203e8ef 100644 --- a/pysr/__main__.py +++ b/pysr/__main__.py @@ -1,4 +1,4 @@ -from pysr._cli.main import pysr as _cli +from ._cli.main import pysr as _cli if __name__ == "__main__": _cli(prog_name="pysr") diff --git a/pysr/_cli/main.py b/pysr/_cli/main.py index d82ab79edf7987362e1c86183f4f112e5787e0b9..38b6fcfd417b4b3c20b7e02746787ede451c964b 100644 --- a/pysr/_cli/main.py +++ b/pysr/_cli/main.py @@ -1,6 +1,17 @@ +import sys +import unittest +import warnings + import click -from ..julia_helpers import install +from ..test import ( + get_runtests_cli, + runtests, + runtests_dev, + runtests_jax, + runtests_startup, + runtests_torch, +) @click.group("pysr") @@ -9,15 +20,13 @@ def pysr(context): ctx = context -@pysr.command("install", help="Install Julia dependencies for PySR.") +@pysr.command("install", help="DEPRECATED (dependencies are now installed at import).") @click.option( "-p", "julia_project", "--project", default=None, type=str, - help="Install in a specific Julia project (e.g., a local copy of SymbolicRegression.jl).", - metavar="PROJECT_DIRECTORY", ) @click.option("-q", "--quiet", is_flag=True, default=False, help="Disable logging.") @click.option( @@ -25,14 +34,55 @@ def pysr(context): "precompile", 
flag_value=True, default=None, - help="Force precompilation of Julia libraries.", ) @click.option( "--no-precompile", "precompile", flag_value=False, default=None, - help="Disable precompilation.", ) def _install(julia_project, quiet, precompile): - install(julia_project, quiet, precompile) + warnings.warn( + "This command is deprecated. Julia dependencies are now installed at first import." + ) + + +TEST_OPTIONS = {"main", "jax", "torch", "cli", "dev", "startup"} + + +@pysr.command("test") +@click.argument("tests", nargs=1) +def _tests(tests): + """Run parts of the PySR test suite. + + Choose from main, jax, torch, cli, dev, and startup. You can give multiple tests, separated by commas. + """ + test_cases = [] + for test in tests.split(","): + if test == "main": + test_cases.extend(runtests(just_tests=True)) + elif test == "jax": + test_cases.extend(runtests_jax(just_tests=True)) + elif test == "torch": + test_cases.extend(runtests_torch(just_tests=True)) + elif test == "cli": + runtests_cli = get_runtests_cli() + test_cases.extend(runtests_cli(just_tests=True)) + elif test == "dev": + test_cases.extend(runtests_dev(just_tests=True)) + elif test == "startup": + test_cases.extend(runtests_startup(just_tests=True)) + else: + warnings.warn(f"Invalid test {test}. Skipping.") + + loader = unittest.TestLoader() + suite = unittest.TestSuite() + for test_case in test_cases: + suite.addTests(loader.loadTestsFromTestCase(test_case)) + runner = unittest.TextTestRunner() + results = runner.run(suite) + # Normally unittest would run this, but here we have + # to do it manually to get the exit code. 
+ + if not results.wasSuccessful(): + sys.exit(1) diff --git a/pysr/deprecated.py b/pysr/deprecated.py index 5f3065e677249c99c7e49e2ad468a5b9377ac2b8..5794563e7a961e59eac5826cafbcbfcdf4f33946 100644 --- a/pysr/deprecated.py +++ b/pysr/deprecated.py @@ -1,6 +1,27 @@ """Various functions to deprecate features.""" import warnings +from .julia_import import jl + + +def install(*args, **kwargs): + del args, kwargs + warnings.warn( + "The `install` function has been removed. " + "PySR now uses the `juliacall` package to install its dependencies automatically at import time. ", + FutureWarning, + ) + + +def init_julia(*args, **kwargs): + del args, kwargs + warnings.warn( + "The `init_julia` function has been removed. " + "Julia is now initialized automatically at import time.", + FutureWarning, + ) + return jl + def pysr(X, y, weights=None, **kwargs): # pragma: no cover from .sr import PySRRegressor @@ -55,38 +76,28 @@ def best_callable(*args, **kwargs): # pragma: no cover ) -def make_deprecated_kwargs_for_pysr_regressor(): - """Create dict of deprecated kwargs.""" - - deprecation_string = """ - fractionReplaced => fraction_replaced - fractionReplacedHof => fraction_replaced_hof - npop => population_size - hofMigration => hof_migration - shouldOptimizeConstants => should_optimize_constants - weightAddNode => weight_add_node - weightDeleteNode => weight_delete_node - weightDoNothing => weight_do_nothing - weightInsertNode => weight_insert_node - weightMutateConstant => weight_mutate_constant - weightMutateOperator => weight_mutate_operator - weightSwapOperands => weight_swap_operands - weightRandomize => weight_randomize - weightSimplify => weight_simplify - crossoverProbability => crossover_probability - perturbationFactor => perturbation_factor - batchSize => batch_size - warmupMaxsizeBy => warmup_maxsize_by - useFrequency => use_frequency - useFrequencyInTournament => use_frequency_in_tournament - """ - # Turn this into a dict: - deprecated_kwargs = {} - for line in 
deprecation_string.splitlines(): - line = line.replace(" ", "") - if line == "": - continue - old, new = line.split("=>") - deprecated_kwargs[old] = new - - return deprecated_kwargs +DEPRECATED_KWARGS = { + "fractionReplaced": "fraction_replaced", + "fractionReplacedHof": "fraction_replaced_hof", + "npop": "population_size", + "hofMigration": "hof_migration", + "shouldOptimizeConstants": "should_optimize_constants", + "weightAddNode": "weight_add_node", + "weightDeleteNode": "weight_delete_node", + "weightDoNothing": "weight_do_nothing", + "weightInsertNode": "weight_insert_node", + "weightMutateConstant": "weight_mutate_constant", + "weightMutateOperator": "weight_mutate_operator", + "weightSwapOperands": "weight_swap_operands", + "weightRandomize": "weight_randomize", + "weightSimplify": "weight_simplify", + "crossoverProbability": "crossover_probability", + "perturbationFactor": "perturbation_factor", + "batchSize": "batch_size", + "warmupMaxsizeBy": "warmup_maxsize_by", + "useFrequency": "use_frequency", + "useFrequencyInTournament": "use_frequency_in_tournament", + "ncyclesperiteration": "ncycles_per_iteration", + "loss": "elementwise_loss", + "full_objective": "loss_function", +} diff --git a/pysr/feynman_problems.py b/pysr/feynman_problems.py deleted file mode 100644 index b64b4139721bd015939aed1a91e17eeb36df38f8..0000000000000000000000000000000000000000 --- a/pysr/feynman_problems.py +++ /dev/null @@ -1,176 +0,0 @@ -import csv -from functools import partial -from pathlib import Path - -import numpy as np - -from .deprecated import best, pysr - -PKG_DIR = Path(__file__).parents[1] -FEYNMAN_DATASET = PKG_DIR / "datasets" / "FeynmanEquations.csv" - - -class Problem: - """ - Problem API to work with PySR. 
- - Has attributes: X, y as pysr accepts, form which is a string representing the correct equation and variable_names - - Should be able to call pysr(problem.X, problem.y, var_names=problem.var_names) and have it work - """ - - def __init__(self, X, y, form=None, variable_names=None): - self.X = X - self.y = y - self.form = form - self.variable_names = variable_names - - -class FeynmanProblem(Problem): - """ - Stores the data for the problems from the 100 Feynman Equations on Physics. - This is the benchmark used in the AI Feynman Paper - """ - - def __init__(self, row, gen=False, dp=500): - """ - row: a row read as a dict from the FeynmanEquations dataset provided in the datasets folder of the repo - gen: If true the problem will have dp X and y values randomly generated else they will be None - """ - self.eq_id = row["Filename"] - self.n_vars = int(row["# variables"]) - super(FeynmanProblem, self).__init__( - None, - None, - form=row["Formula"], - variable_names=[row[f"v{i + 1}_name"] for i in range(self.n_vars)], - ) - self.low = [float(row[f"v{i+1}_low"]) for i in range(self.n_vars)] - self.high = [float(row[f"v{i+1}_high"]) for i in range(self.n_vars)] - self.dp = dp - if gen: - self.X = np.random.uniform(0.01, 25, size=(self.dp, self.n_vars)) - d = {} - for var in range(len(self.variable_names)): - d[self.variable_names[var]] = self.X[:, var] - d["exp"] = np.exp - d["sqrt"] = np.sqrt - d["pi"] = np.pi - d["cos"] = np.cos - d["sin"] = np.sin - d["tan"] = np.tan - d["tanh"] = np.tanh - d["ln"] = np.log - d["log"] = np.log # Quite sure the Feynman dataset has no base 10 logs - d["arcsin"] = np.arcsin - self.y = eval(self.form, d) - - def __str__(self): - return f"Feynman Equation: {self.eq_id}|Form: {self.form}" - - def __repr__(self): - return str(self) - - -def mk_problems(first=100, gen=False, dp=500, data_dir=FEYNMAN_DATASET): - """ - - first: the first "first" equations from the dataset will be made into problems - data_dir: the path pointing to the Feynman 
Equations csv - returns: list of FeynmanProblems - """ - ret = [] - with open(data_dir) as csvfile: - reader = csv.DictReader(csvfile) - for i, row in enumerate(reader): - if i > first: - break - if row["Filename"] == "": - continue - p = FeynmanProblem(row, gen=gen, dp=dp) - ret.append(p) - return ret - - -def run_on_problem(problem, verbosity=0, multiprocessing=True): - """ - Takes in a problem and returns a tuple: (equations, best predicted equation, actual equation) - """ - from time import time - - starting = time() - equations = pysr( - problem.X, - problem.y, - variable_names=problem.variable_names, - verbosity=verbosity, - ) - timing = time() - starting - others = {"time": timing, "problem": problem} - if not multiprocessing: - others["equations"] = equations - return str(best(equations)), problem.form, others - - -def do_feynman_experiments_parallel( - first=100, - verbosity=0, - dp=500, - output_file_path="FeynmanExperiment.csv", - data_dir=FEYNMAN_DATASET, -): - import multiprocessing as mp - - from tqdm import tqdm - - problems = mk_problems(first=first, gen=True, dp=dp, data_dir=data_dir) - ids = [] - predictions = [] - true_equations = [] - time_takens = [] - pool = mp.Pool() - results = [] - with tqdm(total=len(problems)) as pbar: - f = partial(run_on_problem, verbosity=verbosity) - for i, res in enumerate(pool.imap(f, problems)): - results.append(res) - pbar.update() - for res in results: - prediction, true_equation, others = res - problem = others["problem"] - ids.append(problem.eq_id) - predictions.append(prediction) - true_equations.append(true_equation) - time_takens.append(others["time"]) - with open(output_file_path, "a") as f: - writer = csv.writer(f, delimiter=",") - writer.writerow(["ID", "Predicted", "True", "Time"]) - for i in range(len(ids)): - writer.writerow([ids[i], predictions[i], true_equations[i], time_takens[i]]) - - -def do_feynman_experiments( - first=100, - verbosity=0, - dp=500, - output_file_path="FeynmanExperiment.csv", - 
data_dir=FEYNMAN_DATASET, -): - from tqdm import tqdm - - problems = mk_problems(first=first, gen=True, dp=dp, data_dir=data_dir) - ids = [] - predictions = [] - true_equations = [] - time_takens = [] - for problem in tqdm(problems): - prediction, true_equation, others = run_on_problem(problem, verbosity) - ids.append(problem.eq_id) - predictions.append(prediction) - true_equations.append(true_equation) - time_takens.append(others["time"]) - with open(output_file_path, "a") as f: - writer = csv.writer(f, delimiter=",") - writer.writerow(["ID", "Predicted", "True", "Time"]) - for i in range(len(ids)): - writer.writerow([ids[i], predictions[i], true_equations[i], time_takens[i]]) diff --git a/pysr/julia_helpers.py b/pysr/julia_helpers.py index e2f76090cf6d7a564d00bba2cc710dd04bfc959f..ad8341d4051bca4dbf235ff6c50f617e5a1ccc9d 100644 --- a/pysr/julia_helpers.py +++ b/pysr/julia_helpers.py @@ -1,284 +1,18 @@ """Functions for initializing the Julia environment and installing deps.""" -import os -import subprocess -import sys -import warnings -from pathlib import Path -from julia.api import JuliaError +import numpy as np +from juliacall import convert as jl_convert # type: ignore -from .version import __symbolic_regression_jl_version__, __version__ +from .deprecated import init_julia, install +from .julia_import import jl -juliainfo = None -julia_initialized = False -julia_kwargs_at_initialization = None -julia_activated_env = None +jl.seval("using Serialization: Serialization") +jl.seval("using PythonCall: PythonCall") +Serialization = jl.Serialization +PythonCall = jl.PythonCall -def _load_juliainfo(): - """Execute julia.core.JuliaInfo.load(), and store as juliainfo.""" - global juliainfo - - if juliainfo is None: - from julia.core import JuliaInfo - - try: - juliainfo = JuliaInfo.load(julia="julia") - except FileNotFoundError: - env_path = os.environ["PATH"] - raise FileNotFoundError( - f"Julia is not installed in your PATH. 
Please install Julia and add it to your PATH.\n\nCurrent PATH: {env_path}", - ) - - return juliainfo - - -def _get_julia_env_dir(): - # Have to manually get env dir: - try: - julia_env_dir_str = subprocess.run( - ["julia", "-e using Pkg; print(Pkg.envdir())"], - capture_output=True, - env=os.environ, - ).stdout.decode() - except FileNotFoundError: - env_path = os.environ["PATH"] - raise FileNotFoundError( - f"Julia is not installed in your PATH. Please install Julia and add it to your PATH.\n\nCurrent PATH: {env_path}", - ) - return Path(julia_env_dir_str) - - -def _set_julia_project_env(julia_project, is_shared): - if is_shared: - if is_julia_version_greater_eq(version=(1, 7, 0)): - os.environ["JULIA_PROJECT"] = "@" + str(julia_project) - else: - julia_env_dir = _get_julia_env_dir() - os.environ["JULIA_PROJECT"] = str(julia_env_dir / julia_project) - else: - os.environ["JULIA_PROJECT"] = str(julia_project) - - -def _get_io_arg(quiet): - io = "devnull" if quiet else "stderr" - io_arg = f"io={io}" if is_julia_version_greater_eq(version=(1, 6, 0)) else "" - return io_arg - - -def install(julia_project=None, quiet=False, precompile=None): # pragma: no cover - """ - Install PyCall.jl and all required dependencies for SymbolicRegression.jl. - - Also updates the local Julia registry. 
- """ - import julia - - _julia_version_assertion() - # Set JULIA_PROJECT so that we install in the pysr environment - processed_julia_project, is_shared = _process_julia_project(julia_project) - _set_julia_project_env(processed_julia_project, is_shared) - - if precompile == False: - os.environ["JULIA_PKG_PRECOMPILE_AUTO"] = "0" - - try: - julia.install(quiet=quiet) - except julia.tools.PyCallInstallError: - # Attempt to reset PyCall.jl's build: - subprocess.run( - [ - "julia", - "-e", - f'ENV["PYTHON"] = "{sys.executable}"; import Pkg; Pkg.build("PyCall")', - ], - ) - # Try installing again: - try: - julia.install(quiet=quiet) - except julia.tools.PyCallInstallError: - warnings.warn( - "PyCall.jl failed to install on second attempt. " - + "Please consult the GitHub issue " - + "https://github.com/MilesCranmer/PySR/issues/257 " - + "for advice on fixing this." - ) - - Main, init_log = init_julia(julia_project, quiet=quiet, return_aux=True) - io_arg = _get_io_arg(quiet) - - if precompile is None: - precompile = init_log["compiled_modules"] - - if not precompile: - Main.eval('ENV["JULIA_PKG_PRECOMPILE_AUTO"] = 0') - - if is_shared: - # Install SymbolicRegression.jl: - _add_sr_to_julia_project(Main, io_arg) - - Main.eval("using Pkg") - Main.eval(f"Pkg.instantiate({io_arg})") - - if precompile: - Main.eval(f"Pkg.precompile({io_arg})") - - if not quiet: - warnings.warn( - "It is recommended to restart Python after installing PySR's dependencies," - " so that the Julia environment is properly initialized." - ) - - -def _import_error(): - return """ - Required dependencies are not installed or built. 
Run the following command in your terminal: - python3 -m pysr install - """ - - -def _process_julia_project(julia_project): - if julia_project is None: - is_shared = True - processed_julia_project = f"pysr-{__version__}" - elif julia_project[0] == "@": - is_shared = True - processed_julia_project = julia_project[1:] - else: - is_shared = False - processed_julia_project = Path(julia_project) - return processed_julia_project, is_shared - - -def is_julia_version_greater_eq(juliainfo=None, version=(1, 6, 0)): - """Check if Julia version is greater than specified version.""" - if juliainfo is None: - juliainfo = _load_juliainfo() - current_version = ( - juliainfo.version_major, - juliainfo.version_minor, - juliainfo.version_patch, - ) - return current_version >= version - - -def _check_for_conflicting_libraries(): # pragma: no cover - """Check whether there are conflicting modules, and display warnings.""" - # See https://github.com/pytorch/pytorch/issues/78829: importing - # pytorch before running `pysr.fit` causes a segfault. - torch_is_loaded = "torch" in sys.modules - if torch_is_loaded: - warnings.warn( - "`torch` was loaded before the Julia instance started. " - "This may cause a segfault when running `PySRRegressor.fit`. " - "To avoid this, please run `pysr.julia_helpers.init_julia()` *before* " - "importing `torch`. 
" - "For updates, see https://github.com/pytorch/pytorch/issues/78829" - ) - - -def init_julia(julia_project=None, quiet=False, julia_kwargs=None, return_aux=False): - """Initialize julia binary, turning off compiled modules if needed.""" - global julia_initialized - global julia_kwargs_at_initialization - global julia_activated_env - - if not julia_initialized: - _check_for_conflicting_libraries() - - if julia_kwargs is None: - julia_kwargs = {"optimize": 3} - - from julia.core import JuliaInfo, UnsupportedPythonError - - _julia_version_assertion() - processed_julia_project, is_shared = _process_julia_project(julia_project) - _set_julia_project_env(processed_julia_project, is_shared) - - try: - info = JuliaInfo.load(julia="julia") - except FileNotFoundError: - env_path = os.environ["PATH"] - raise FileNotFoundError( - f"Julia is not installed in your PATH. Please install Julia and add it to your PATH.\n\nCurrent PATH: {env_path}", - ) - - if not info.is_pycall_built(): - raise ImportError(_import_error()) - - from julia.core import Julia - - try: - Julia(**julia_kwargs) - except UnsupportedPythonError: - # Static python binary, so we turn off pre-compiled modules. - julia_kwargs = {**julia_kwargs, "compiled_modules": False} - Julia(**julia_kwargs) - warnings.warn( - "Your system's Python library is static (e.g., conda), so precompilation will be turned off. For a dynamic library, try using `pyenv` and installing with `--enable-shared`: https://github.com/pyenv/pyenv/blob/master/plugins/python-build/README.md#building-with---enable-shared." 
- ) - - using_compiled_modules = (not "compiled_modules" in julia_kwargs) or julia_kwargs[ - "compiled_modules" - ] - - from julia import Main as _Main - - Main = _Main - - if julia_activated_env is None: - julia_activated_env = processed_julia_project - - if julia_initialized and julia_kwargs_at_initialization is not None: - # Check if the kwargs are the same as the previous initialization - init_set = set(julia_kwargs_at_initialization.items()) - new_set = set(julia_kwargs.items()) - set_diff = new_set - init_set - # Remove the `compiled_modules` key, since it is not a user-specified kwarg: - set_diff = {k: v for k, v in set_diff if k != "compiled_modules"} - if len(set_diff) > 0: - warnings.warn( - "Julia has already started. The new Julia options " - + str(set_diff) - + " will be ignored." - ) - - if julia_initialized and julia_activated_env != processed_julia_project: - Main.eval("using Pkg") - - io_arg = _get_io_arg(quiet) - # Can't pass IO to Julia call as it evaluates to PyObject, so just directly - # use Main.eval: - Main.eval( - f'Pkg.activate("{_escape_filename(processed_julia_project)}",' - f"shared = Bool({int(is_shared)}), " - f"{io_arg})" - ) - - julia_activated_env = processed_julia_project - - if not julia_initialized: - julia_kwargs_at_initialization = julia_kwargs - - julia_initialized = True - if return_aux: - return Main, {"compiled_modules": using_compiled_modules} - return Main - - -def _add_sr_to_julia_project(Main, io_arg): - Main.eval("using Pkg") - Main.eval("Pkg.Registry.update()") - Main.sr_spec = Main.PackageSpec( - name="SymbolicRegression", - url="https://github.com/MilesCranmer/SymbolicRegression.jl", - rev="v" + __symbolic_regression_jl_version__, - ) - Main.clustermanagers_spec = Main.PackageSpec( - name="ClusterManagers", - version="0.4", - ) - Main.eval(f"Pkg.add([sr_spec, clustermanagers_spec], {io_arg})") +jl.seval("using SymbolicRegression: plus, sub, mult, div, pow") def _escape_filename(filename): @@ -288,60 +22,27 @@ def 
_escape_filename(filename): return str_repr -def _julia_version_assertion(): - if not is_julia_version_greater_eq(version=(1, 6, 0)): - raise NotImplementedError( - "PySR requires Julia 1.6.0 or greater. " - "Please update your Julia installation." - ) - - -def _backend_version_assertion(Main): - try: - backend_version = Main.eval("string(SymbolicRegression.PACKAGE_VERSION)") - expected_backend_version = __symbolic_regression_jl_version__ - if backend_version != expected_backend_version: # pragma: no cover - warnings.warn( - f"PySR backend (SymbolicRegression.jl) version {backend_version} " - f"does not match expected version {expected_backend_version}. " - "Things may break. " - "Please update your PySR installation with " - "`python3 -m pysr install`." - ) - except JuliaError: # pragma: no cover - warnings.warn( - "You seem to have an outdated version of SymbolicRegression.jl. " - "Things may break. " - "Please update your PySR installation with " - "`python3 -m pysr install`." - ) - - -def _load_cluster_manager(Main, cluster_manager): - Main.eval(f"import ClusterManagers: addprocs_{cluster_manager}") - return Main.eval(f"addprocs_{cluster_manager}") - +def _load_cluster_manager(cluster_manager): + jl.seval(f"using ClusterManagers: addprocs_{cluster_manager}") + return jl.seval(f"addprocs_{cluster_manager}") -def _update_julia_project(Main, is_shared, io_arg): - try: - if is_shared: - _add_sr_to_julia_project(Main, io_arg) - Main.eval("using Pkg") - Main.eval(f"Pkg.resolve({io_arg})") - except (JuliaError, RuntimeError) as e: - raise ImportError(_import_error()) from e +def jl_array(x): + if x is None: + return None + return jl_convert(jl.Array, x) -def _load_backend(Main): - try: - # Load namespace, so that various internal operators work: - Main.eval("using SymbolicRegression") - except (JuliaError, RuntimeError) as e: - raise ImportError(_import_error()) from e - _backend_version_assertion(Main) +def jl_serialize(obj): + buf = jl.IOBuffer() + 
Serialization.serialize(buf, obj) + return np.array(jl.take_b(buf)) - # Load Julia package SymbolicRegression.jl - from julia import SymbolicRegression - return SymbolicRegression +def jl_deserialize(s): + if s is None: + return s + buf = jl.IOBuffer() + jl.write(buf, jl_array(s)) + jl.seekstart(buf) + return Serialization.deserialize(buf) diff --git a/pysr/julia_import.py b/pysr/julia_import.py new file mode 100644 index 0000000000000000000000000000000000000000..3e588d3b7b39a159beae949609b866500220bc21 --- /dev/null +++ b/pysr/julia_import.py @@ -0,0 +1,76 @@ +import os +import sys +import warnings + +# Check if JuliaCall is already loaded, and if so, warn the user +# about the relevant environment variables. If not loaded, +# set up sensible defaults. +if "juliacall" in sys.modules: + warnings.warn( + "juliacall module already imported. " + "Make sure that you have set the environment variable `PYTHON_JULIACALL_HANDLE_SIGNALS=yes` to avoid segfaults. " + "Also note that PySR will not be able to configure `PYTHON_JULIACALL_THREADS` or `PYTHON_JULIACALL_OPTLEVEL` for you." + ) +else: + # Required to avoid segfaults (https://juliapy.github.io/PythonCall.jl/dev/faq/) + if os.environ.get("PYTHON_JULIACALL_HANDLE_SIGNALS", "yes") != "yes": + warnings.warn( + "PYTHON_JULIACALL_HANDLE_SIGNALS environment variable is set to something other than 'yes' or ''. " + + "You will experience segfaults if running with multithreading." + ) + + if os.environ.get("PYTHON_JULIACALL_THREADS", "auto") != "auto": + warnings.warn( + "PYTHON_JULIACALL_THREADS environment variable is set to something other than 'auto', " + "so PySR was not able to set it. You may wish to set it to `'auto'` for full use " + "of your CPU." 
+ ) + + # TODO: Remove these when juliapkg lets you specify this + for k, default in ( + ("PYTHON_JULIACALL_HANDLE_SIGNALS", "yes"), + ("PYTHON_JULIACALL_THREADS", "auto"), + ("PYTHON_JULIACALL_OPTLEVEL", "3"), + ): + os.environ[k] = os.environ.get(k, default) + + +from juliacall import Main as jl # type: ignore + +# Overwrite the seval function to use Meta.parseall +# instead of Meta.parse. +jl.seval("using PythonCall: PythonCall, Py, pyconvert") +jl.seval( + """function PythonCall.pyjlmodule_seval(self::Module, expr::Py) + e = Meta.parseall(strip(pyconvert(String, expr))) + Py(Base.eval(self, e)) +end""" +) +# ^TODO: Overwrite this once PythonCall.jl is updated: + +jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch) + +# Next, automatically load the juliacall extension if we're in a Jupyter notebook +autoload_extensions = os.environ.get("PYSR_AUTOLOAD_EXTENSIONS", "yes") +if autoload_extensions in {"yes", ""} and jl_version >= (1, 9, 0): + try: + get_ipython = sys.modules["IPython"].get_ipython + + if "IPKernelApp" not in get_ipython().config: + raise ImportError("console") + + print( + "Detected Jupyter notebook. Loading juliacall extension. Set `PYSR_AUTOLOAD_EXTENSIONS=no` to disable." + ) + + # TODO: Turn this off if juliacall does this automatically + get_ipython().run_line_magic("load_ext", "juliacall") + except Exception: + pass +elif autoload_extensions not in {"no", "yes", ""}: + warnings.warn( + "PYSR_AUTOLOAD_EXTENSIONS environment variable is set to something other than 'yes' or 'no' or ''." 
+ ) + +jl.seval("using SymbolicRegression") +SymbolicRegression = jl.SymbolicRegression diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json new file mode 100644 index 0000000000000000000000000000000000000000..a578fdff8218d193453f2111de1a1362522ae53a --- /dev/null +++ b/pysr/juliapkg.json @@ -0,0 +1,21 @@ +{ + "julia": "1.6", + "packages": { + "SymbolicRegression": { + "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb", + "version": "=0.23.1" + }, + "ClusterManagers": { + "uuid": "34f1f09b-3a8b-5176-ab39-66d58a4d544e", + "version": "0.4" + }, + "Serialization": { + "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b", + "version": "1" + }, + "Zygote": { + "uuid": "e88e6eb3-aa80-5325-afca-941959d7151f", + "version": "0.6" + } + } +} diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml index 145c6b054a8d369f2032ad0288618e6fd4f576ff..a875c90a9c3e6ac8a2c75374817c9336bfa84023 100644 --- a/pysr/param_groupings.yml +++ b/pysr/param_groupings.yml @@ -8,10 +8,10 @@ - niterations - populations - population_size - - ncyclesperiteration + - ncycles_per_iteration - The Objective: - - loss - - full_objective + - elementwise_loss + - loss_function - model_selection - dimensional_constraint_penalty - Working with Complexities: @@ -88,9 +88,7 @@ - temp_equation_file - tempdir - delete_tempfiles - - julia_project - update - - julia_kwargs - Exporting the Results: - equation_file - output_jax_format diff --git a/pysr/sklearn_monkeypatch.py b/pysr/sklearn_monkeypatch.py index 2d7d1dbe893cbacb4eec8c3b235958d77da6e227..703767e4e296c37a9bb7c11f675c3b3d025abeed 100644 --- a/pysr/sklearn_monkeypatch.py +++ b/pysr/sklearn_monkeypatch.py @@ -9,5 +9,5 @@ def _ensure_no_complex_data(*args, **kwargs): try: validation._ensure_no_complex_data = _ensure_no_complex_data -except AttributeError: +except AttributeError: # pragma: no cover ... 
diff --git a/pysr/sr.py b/pysr/sr.py index 8bddece191dd233208ec3f7ffdec6b35374133ca..6326abd3521d521a243b36e0d6919e576077fd19 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -25,7 +25,7 @@ from sklearn.utils import check_array, check_consistent_length, check_random_sta from sklearn.utils.validation import _check_feature_names_in, check_is_fitted from .denoising import denoise, multi_denoise -from .deprecated import make_deprecated_kwargs_for_pysr_regressor +from .deprecated import DEPRECATED_KWARGS from .export_jax import sympy2jax from .export_latex import sympy2latex, sympy2latextable, sympy2multilatextable from .export_numpy import sympy2numpy @@ -33,14 +33,14 @@ from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2 from .export_torch import sympy2torch from .feature_selection import run_feature_selection from .julia_helpers import ( + PythonCall, _escape_filename, - _load_backend, _load_cluster_manager, - _process_julia_project, - _update_julia_project, - init_julia, - is_julia_version_greater_eq, + jl_array, + jl_deserialize, + jl_serialize, ) +from .julia_import import SymbolicRegression, jl from .utils import ( _csv_filename_to_pkl_filename, _preprocess_julia_floats, @@ -48,8 +48,6 @@ from .utils import ( _subscriptify, ) -Main = None # TODO: Rename to more descriptive name like "julia_runtime" - already_ran = False @@ -92,7 +90,6 @@ def _process_constraints(binary_operators, unary_operators, constraints): def _maybe_create_inline_operators( binary_operators, unary_operators, extra_sympy_mappings ): - global Main binary_operators = binary_operators.copy() unary_operators = unary_operators.copy() for op_list in [binary_operators, unary_operators]: @@ -100,7 +97,7 @@ def _maybe_create_inline_operators( is_user_defined_operator = "(" in op if is_user_defined_operator: - Main.eval(op) + jl.seval(op) # Cut off from the first non-alphanumeric char: first_non_char = [j for j, char in enumerate(op) if char == "("][0] function_name = 
op[:first_non_char] @@ -271,7 +268,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): arguments are treated the same way, and the max of each argument is constrained. Default is `None`. - loss : str + elementwise_loss : str String of Julia code specifying an elementwise loss function. Can either be a loss from LossFunctions.jl, or your own loss written as a function. Examples of custom written losses include: @@ -287,11 +284,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, `SigmoidLoss()`, `DWDMarginLoss(q)`. Default is `"L2DistLoss()"`. - full_objective : str + loss_function : str Alternatively, you can specify the full objective function as a snippet of Julia code, including any sort of custom evaluation (including symbolic manipulations beforehand), and any sort - of loss function or regularizations. The default `full_objective` + of loss function or regularizations. The default `loss_function` used in SymbolicRegression.jl is roughly equal to: ```julia function eval_loss(tree, dataset::Dataset{T,L}, options)::L where {T,L} @@ -357,7 +354,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): takes a loss and complexity as input, for example: `"f(loss, complexity) = (loss < 0.1) && (complexity < 10)"`. Default is `None`. - ncyclesperiteration : int + ncycles_per_iteration : int Number of total mutations to run, per 10 samples of the population, per iteration. Default is `550`. @@ -401,7 +398,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Constant optimization can also be performed as a mutation, in addition to the normal strategy controlled by `optimize_probability` which happens every iteration. Using it as a mutation is useful if you want to use - a large `ncyclesperiteration`, and may not optimize very often. + a large `ncycles_per_iteration`, and may not optimize very often. Default is `0.0`. 
crossover_probability : float Absolute probability of crossover-type genetic operation, instead of a mutation. @@ -536,11 +533,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): delete_tempfiles : bool Whether to delete the temporary files after finishing. Default is `True`. - julia_project : str - A Julia environment location containing a Project.toml - (and potentially the source code for SymbolicRegression.jl). - Default gives the Python package directory, where a - Project.toml file should be present from the install. update: bool Whether to automatically update Julia packages when `fit` is called. You should make sure that PySR is up-to-date itself first, as @@ -585,11 +577,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): before passing to the symbolic regression code. None means no feature selection; an int means select that many features. Default is `None`. - julia_kwargs : dict - Keyword arguments to pass to `julia.core.Julia(...)` to initialize - the Julia runtime. The default, when `None`, is to set `threads` equal - to `procs`, and `optimize` to 3. - Default is `None`. **kwargs : dict Supports deprecated keyword arguments. Other arguments will result in an error. @@ -617,8 +604,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Path to the temporary equations directory. equation_file_ : str Output equation file name produced by the julia backend. - raw_julia_state_ : tuple[list[PyCall.jlwrap], PyCall.jlwrap] - The state for the julia SymbolicRegression.jl backend post fitting. + julia_state_stream_ : ndarray + The serialized state for the julia SymbolicRegression.jl backend (after fitting), + stored as an array of uint8, produced by Julia's Serialization.serialize function. + julia_state_ + The deserialized state. + julia_options_stream_ : ndarray + The serialized julia options, stored as an array of uint8, + julia_options_ + The deserialized julia options. 
equation_file_contents_ : list[pandas.DataFrame] Contents of the equation file output by the Julia backend. show_pickle_warnings_ : bool @@ -643,7 +637,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): ... "inv(x) = 1/x", # Custom operator (julia syntax) ... ], ... model_selection="best", - ... loss="loss(x, y) = (x - y)^2", # Custom loss function (julia syntax) + ... elementwise_loss="loss(x, y) = (x - y)^2", # Custom loss function (julia syntax) ... ) >>> model.fit(X, y) >>> model @@ -681,8 +675,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): timeout_in_seconds: Optional[float] = None, constraints: Optional[Dict[str, Union[int, Tuple[int, int]]]] = None, nested_constraints: Optional[Dict[str, Dict[str, int]]] = None, - loss: Optional[str] = None, - full_objective: Optional[str] = None, + elementwise_loss: Optional[str] = None, + loss_function: Optional[str] = None, complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None, complexity_of_constants: Union[int, float] = 1, complexity_of_variables: Union[int, float] = 1, @@ -694,7 +688,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): alpha: float = 0.1, annealing: bool = False, early_stop_condition: Optional[Union[float, str]] = None, - ncyclesperiteration: int = 550, + ncycles_per_iteration: int = 550, fraction_replaced: float = 0.000364, fraction_replaced_hof: float = 0.035, weight_add_node: float = 0.79, @@ -744,7 +738,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): temp_equation_file: bool = False, tempdir: Optional[str] = None, delete_tempfiles: bool = True, - julia_project: Optional[str] = None, update: bool = False, output_jax_format: bool = False, output_torch_format: bool = False, @@ -753,7 +746,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): extra_jax_mappings: Optional[Dict[Callable, str]] = None, denoise: bool = False, select_k_features: Optional[int] = None, - 
julia_kwargs: Optional[Dict] = None, **kwargs, ): # Hyperparameters @@ -764,7 +756,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): self.niterations = niterations self.populations = populations self.population_size = population_size - self.ncyclesperiteration = ncyclesperiteration + self.ncycles_per_iteration = ncycles_per_iteration # - Equation Constraints self.maxsize = maxsize self.maxdepth = maxdepth @@ -777,8 +769,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): self.timeout_in_seconds = timeout_in_seconds self.early_stop_condition = early_stop_condition # - Loss parameters - self.loss = loss - self.full_objective = full_objective + self.elementwise_loss = elementwise_loss + self.loss_function = loss_function self.complexity_of_operators = complexity_of_operators self.complexity_of_constants = complexity_of_constants self.complexity_of_variables = complexity_of_variables @@ -844,7 +836,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): self.temp_equation_file = temp_equation_file self.tempdir = tempdir self.delete_tempfiles = delete_tempfiles - self.julia_project = julia_project self.update = update self.output_jax_format = output_jax_format self.output_torch_format = output_torch_format @@ -854,16 +845,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): # Pre-modelling transformation self.denoise = denoise self.select_k_features = select_k_features - self.julia_kwargs = julia_kwargs # Once all valid parameters have been assigned handle the # deprecated kwargs if len(kwargs) > 0: # pragma: no cover - deprecated_kwargs = make_deprecated_kwargs_for_pysr_regressor() for k, v in kwargs.items(): # Handle renamed kwargs - if k in deprecated_kwargs: - updated_kwarg_name = deprecated_kwargs[k] + if k in DEPRECATED_KWARGS: + updated_kwarg_name = DEPRECATED_KWARGS[k] setattr(self, updated_kwarg_name, v) warnings.warn( f"{k} has been renamed to {updated_kwarg_name} in 
PySRRegressor. " @@ -877,6 +866,19 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): f"Ignoring parameter; please pass {k} during the call to fit instead.", FutureWarning, ) + elif k == "julia_project": + warnings.warn( + "The `julia_project` parameter has been deprecated. To use a custom " + "julia project, please see `https://astroautomata.com/PySR/backend`.", + FutureWarning, + ) + elif k == "julia_kwargs": + warnings.warn( + "The `julia_kwargs` parameter has been deprecated. To pass custom " + "keyword arguments to the julia backend, you should use environment variables. " + "See the Julia documentation for more information.", + FutureWarning, + ) else: raise TypeError( f"{k} is not a valid keyword argument for PySRRegressor." @@ -1051,7 +1053,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): serialization. Thus, for `PySRRegressor` to support pickle serialization, the - `raw_julia_state_` attribute must be hidden from pickle. This will + `julia_state_stream_` attribute must be hidden from pickle. This will prevent the `warm_start` of any model that is loaded via `pickle.loads()`, but does allow all other attributes of a fitted `PySRRegressor` estimator to be serialized. Note: Jax and Torch format equations are also removed @@ -1061,12 +1063,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): show_pickle_warning = not ( "show_pickle_warnings_" in state and not state["show_pickle_warnings_"] ) - if "raw_julia_state_" in state and show_pickle_warning: - warnings.warn( - "raw_julia_state_ cannot be pickled and will be removed from the " - "serialized instance. This will prevent a `warm_start` fit of any " - "model that is deserialized via `pickle.load()`." 
- ) state_keys_containing_lambdas = ["extra_sympy_mappings", "extra_torch_mappings"] for state_key in state_keys_containing_lambdas: if state[state_key] is not None and show_pickle_warning: @@ -1075,7 +1071,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): "serialized instance. When loading the model, please redefine " f"`{state_key}` at runtime." ) - state_keys_to_clear = ["raw_julia_state_"] + state_keys_containing_lambdas + state_keys_to_clear = state_keys_containing_lambdas pickled_state = { key: (None if key in state_keys_to_clear else value) for key, value in state.items() @@ -1125,6 +1121,24 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): ) return self.equations_ + @property + def julia_options_(self): + return jl_deserialize(self.julia_options_stream_) + + @property + def julia_state_(self): + return jl_deserialize(self.julia_state_stream_) + + @property + def raw_julia_state_(self): + warnings.warn( + "PySRRegressor.raw_julia_state_ is now deprecated. " + "Please use PySRRegressor.julia_state_ instead, or julia_state_stream_ " + "for the raw stream of bytes.", + FutureWarning, + ) + return self.julia_state_ + def get_best(self, index=None): """ Get best equation using `model_selection`. @@ -1238,8 +1252,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): "to True and `procs` to 0 will result in non-deterministic searches. " ) - if self.loss is not None and self.full_objective is not None: - raise ValueError("You cannot set both `loss` and `full_objective`.") + if self.elementwise_loss is not None and self.loss_function is not None: + raise ValueError( + "You cannot set both `elementwise_loss` and `loss_function`." 
+ ) # NotImplementedError - Values that could be supported at a later time if self.optimizer_algorithm not in VALID_OPTIMIZER_ALGORITHMS: @@ -1291,16 +1307,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): > 0 ) - julia_kwargs = {} - if self.julia_kwargs is not None: - for key, value in self.julia_kwargs.items(): - julia_kwargs[key] = value - if "optimize" not in julia_kwargs: - julia_kwargs["optimize"] = 3 - if "threads" not in julia_kwargs and packed_modified_params["multithreading"]: - julia_kwargs["threads"] = self.procs - packed_modified_params["julia_kwargs"] = julia_kwargs - return packed_modified_params def _validate_and_set_fit_params( @@ -1528,7 +1534,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): # Need to be global as we don't want to recreate/reinstate julia for # every new instance of PySRRegressor global already_ran - global Main # These are the parameters which may be modified from the ones # specified in init, so we define them here locally: @@ -1543,32 +1548,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): batch_size = mutated_params["batch_size"] update_verbosity = mutated_params["update_verbosity"] progress = mutated_params["progress"] - julia_kwargs = mutated_params["julia_kwargs"] # Start julia backend processes if not already_ran and update_verbosity != 0: print("Compiling Julia backend...") - Main = init_julia(self.julia_project, julia_kwargs=julia_kwargs) - if cluster_manager is not None: - cluster_manager = _load_cluster_manager(Main, cluster_manager) - - if self.update: - _, is_shared = _process_julia_project(self.julia_project) - io = "devnull" if update_verbosity == 0 else "stderr" - io_arg = ( - f"io={io}" if is_julia_version_greater_eq(version=(1, 6, 0)) else "" - ) - _update_julia_project(Main, is_shared, io_arg) - - SymbolicRegression = _load_backend(Main) - - Main.plus = Main.eval("(+)") - Main.sub = Main.eval("(-)") - Main.mult = Main.eval("(*)") - 
Main.pow = Main.eval("(^)") - Main.div = Main.eval("(/)") + cluster_manager = _load_cluster_manager(cluster_manager) # TODO(mcranmer): These functions should be part of this class. binary_operators, unary_operators = _maybe_create_inline_operators( @@ -1594,7 +1580,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): nested_constraints_str += f"({inner_k}) => {inner_v}, " nested_constraints_str += "), " nested_constraints_str += ")" - nested_constraints = Main.eval(nested_constraints_str) + nested_constraints = jl.seval(nested_constraints_str) # Parse dict into Julia Dict for complexities: if complexity_of_operators is not None: @@ -1602,13 +1588,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): for k, v in complexity_of_operators.items(): complexity_of_operators_str += f"({k}) => {v}, " complexity_of_operators_str += ")" - complexity_of_operators = Main.eval(complexity_of_operators_str) + complexity_of_operators = jl.seval(complexity_of_operators_str) - custom_loss = Main.eval(self.loss) - custom_full_objective = Main.eval(self.full_objective) + custom_loss = jl.seval( + str(self.elementwise_loss) + if self.elementwise_loss is not None + else "nothing" + ) + custom_full_objective = jl.seval( + str(self.loss_function) if self.loss_function is not None else "nothing" + ) - early_stop_condition = Main.eval( - str(self.early_stop_condition) if self.early_stop_condition else None + early_stop_condition = jl.seval( + str(self.early_stop_condition) + if self.early_stop_condition is not None + else "nothing" ) mutation_weights = SymbolicRegression.MutationWeights( @@ -1627,10 +1621,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): # Call to Julia backend. 
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl options = SymbolicRegression.Options( - binary_operators=Main.eval(str(binary_operators).replace("'", "")), - unary_operators=Main.eval(str(unary_operators).replace("'", "")), - bin_constraints=bin_constraints, - una_constraints=una_constraints, + binary_operators=jl.seval(str(binary_operators).replace("'", "")), + unary_operators=jl.seval(str(unary_operators).replace("'", "")), + bin_constraints=jl_array(bin_constraints), + una_constraints=jl_array(una_constraints), complexity_of_operators=complexity_of_operators, complexity_of_constants=self.complexity_of_constants, complexity_of_variables=self.complexity_of_variables, @@ -1665,7 +1659,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): use_frequency_in_tournament=self.use_frequency_in_tournament, adaptive_parsimony_scaling=self.adaptive_parsimony_scaling, npop=self.population_size, - ncycles_per_iteration=self.ncyclesperiteration, + ncycles_per_iteration=self.ncycles_per_iteration, fraction_replaced=self.fraction_replaced, topn=self.topn, print_precision=self.print_precision, @@ -1685,6 +1679,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): define_helper_functions=False, ) + self.julia_options_stream_ = jl_serialize(options) + # Convert data to desired precision test_X = np.array(X) is_complex = np.issubdtype(test_X.dtype, np.complexfloating) @@ -1695,18 +1691,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): np_dtype = {32: np.complex64, 64: np.complex128}[self.precision] # This converts the data into a Julia array: - Main.X = np.array(X, dtype=np_dtype).T + jl_X = jl_array(np.array(X, dtype=np_dtype).T) if len(y.shape) == 1: - Main.y = np.array(y, dtype=np_dtype) + jl_y = jl_array(np.array(y, dtype=np_dtype)) else: - Main.y = np.array(y, dtype=np_dtype).T + jl_y = jl_array(np.array(y, dtype=np_dtype).T) if weights is not None: if 
len(weights.shape) == 1: - Main.weights = np.array(weights, dtype=np_dtype) + jl_weights = jl_array(np.array(weights, dtype=np_dtype)) else: - Main.weights = np.array(weights, dtype=np_dtype).T + jl_weights = jl_array(np.array(weights, dtype=np_dtype).T) else: - Main.weights = None + jl_weights = None if self.procs == 0 and not multithreading: parallelism = "serial" @@ -1719,34 +1715,41 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): None if parallelism in ["serial", "multithreading"] else int(self.procs) ) - y_variable_names = None if len(y.shape) > 1: # We set these manually so that they respect Python's 0 indexing # (by default Julia will use y1, y2...) - y_variable_names = [f"y{_subscriptify(i)}" for i in range(y.shape[1])] + jl_y_variable_names = jl_array( + [f"y{_subscriptify(i)}" for i in range(y.shape[1])] + ) + else: + jl_y_variable_names = None - # Call to Julia backend. - # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl - self.raw_julia_state_ = SymbolicRegression.equation_search( - Main.X, - Main.y, - weights=Main.weights, + PythonCall.GC.disable() + out = SymbolicRegression.equation_search( + jl_X, + jl_y, + weights=jl_weights, niterations=int(self.niterations), - variable_names=self.feature_names_in_.tolist(), - display_variable_names=self.display_feature_names_in_.tolist(), - y_variable_names=y_variable_names, - X_units=self.X_units_, - y_units=self.y_units_, + variable_names=jl_array([str(v) for v in self.feature_names_in_]), + display_variable_names=jl_array( + [str(v) for v in self.display_feature_names_in_] + ), + y_variable_names=jl_y_variable_names, + X_units=jl_array(self.X_units_), + y_units=jl_array(self.y_units_), options=options, numprocs=cprocs, parallelism=parallelism, - saved_state=self.raw_julia_state_, + saved_state=self.julia_state_, return_state=True, addprocs_function=cluster_manager, heap_size_hint_in_bytes=self.heap_size_hint_in_bytes, progress=progress and 
self.verbosity > 0 and len(y.shape) == 1, verbosity=int(self.verbosity), ) + PythonCall.GC.enable() + + self.julia_state_stream_ = jl_serialize(out) # Set attributes self.equations_ = self.get_hof() @@ -1810,10 +1813,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Fitted estimator. """ # Init attributes that are not specified in BaseEstimator - if self.warm_start and hasattr(self, "raw_julia_state_"): + if self.warm_start and hasattr(self, "julia_state_stream_"): pass else: - if hasattr(self, "raw_julia_state_"): + if hasattr(self, "julia_state_stream_"): warnings.warn( "The discovered expressions are being reset. " "Please set `warm_start=True` if you wish to continue " @@ -1823,7 +1826,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): self.equations_ = None self.nout_ = 1 self.selection_mask_ = None - self.raw_julia_state_ = None + self.julia_state_stream_ = None + self.julia_options_stream_ = None self.X_units_ = None self.y_units_ = None diff --git a/pysr/test/__init__.py b/pysr/test/__init__.py index 7b176eab67718b614d8226ef6572871b97b2211a..cb6b9e4a396dc29dad0b59d047d88c0dcf7d2ac2 100644 --- a/pysr/test/__init__.py +++ b/pysr/test/__init__.py @@ -1,7 +1,15 @@ from .test import runtests -from .test_cli import runtests as runtests_cli -from .test_env import runtests as runtests_env +from .test_cli import get_runtests as get_runtests_cli +from .test_dev import runtests as runtests_dev from .test_jax import runtests as runtests_jax +from .test_startup import runtests as runtests_startup from .test_torch import runtests as runtests_torch -__all__ = ["runtests", "runtests_env", "runtests_jax", "runtests_torch", "runtests_cli"] +__all__ = [ + "runtests", + "runtests_jax", + "runtests_torch", + "get_runtests_cli", + "runtests_startup", + "runtests_dev", +] diff --git a/pysr/test/__main__.py b/pysr/test/__main__.py index b0ec3b36db6e2730b70b9d1807d8383eba61d4bd..d0fca112e52d7ff0a0f865778f1daad3981b9926 100644 --- 
a/pysr/test/__main__.py +++ b/pysr/test/__main__.py @@ -1,43 +1,13 @@ """CLI for running PySR's test suite.""" import argparse -import os from . import * if __name__ == "__main__": # Get args: parser = argparse.ArgumentParser() - parser.usage = "python -m pysr.test [tests...]" parser.add_argument( "test", nargs="*", - help="Test to run. One or more of 'main', 'env', 'jax', 'torch', 'cli'.", + help="DEPRECATED. Use `python -m pysr test [tests...]` instead.", ) - - # Parse args: - args = parser.parse_args() - tests = args.test - - if len(tests) == 0: - # Raise help message: - parser.print_help() - raise SystemExit(1) - - # Run tests: - for test in tests: - if test in {"main", "env", "jax", "torch", "cli"}: - cur_dir = os.path.dirname(os.path.abspath(__file__)) - print(f"Running test from {cur_dir}") - if test == "main": - runtests() - elif test == "env": - runtests_env() - elif test == "jax": - runtests_jax() - elif test == "torch": - runtests_torch() - elif test == "cli": - runtests_cli() - else: - parser.print_help() - raise SystemExit(1) diff --git a/pysr/test/generate_dev_juliapkg.py b/pysr/test/generate_dev_juliapkg.py new file mode 100644 index 0000000000000000000000000000000000000000..6059cfc927cdc10ad6bc6a73ee5345a8723fdafa --- /dev/null +++ b/pysr/test/generate_dev_juliapkg.py @@ -0,0 +1,17 @@ +# Example call: +## python3 generate_dev_juliapkg.py /pysr/pysr/juliapkg.json /srjl +import json +import sys + +juliapkg_json = sys.argv[1] +path_to_srjl = sys.argv[2] + +with open(juliapkg_json, "r") as f: + juliapkg = json.load(f) + +del juliapkg["packages"]["SymbolicRegression"]["version"] +juliapkg["packages"]["SymbolicRegression"]["path"] = path_to_srjl +juliapkg["packages"]["SymbolicRegression"]["dev"] = True + +with open(juliapkg_json, "w") as f: + json.dump(juliapkg, f, indent=4) diff --git a/pysr/test/incremental_install_simulator.dockerfile b/pysr/test/incremental_install_simulator.dockerfile deleted file mode 100644 index 
62811e8c3ebf181ae4649e9437916051a6f24c0d..0000000000000000000000000000000000000000 --- a/pysr/test/incremental_install_simulator.dockerfile +++ /dev/null @@ -1,52 +0,0 @@ -# This dockerfile simulates a user installation that first -# builds PySR for Python 3.9, and then upgrades to Python 3.10. -# Normally this would cause an error when installing PyCall, so we want to -# ensure that PySR can automatically patch things. -FROM debian:bullseye-slim - -ENV DEBIAN_FRONTEND=noninteractive - -# Install juliaup and pyenv: -RUN apt-get update && apt-get install -y curl git build-essential \ - libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev \ - libncurses5-dev libncursesw5-dev xz-utils libffi-dev liblzma-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Install juliaup: -RUN curl -fsSL https://install.julialang.org | sh -s -- -y - -# Install pyenv: -RUN curl -fsSL curl https://pyenv.run | sh && \ - echo 'export PATH="/root/.pyenv/bin:$PATH"' >> ~/.bashrc && \ - echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \ - echo 'eval "$(pyenv init -)"' >> ~/.bashrc && \ - echo 'eval "$(pyenv virtualenv-init -)"' >> ~/.bashrc - -# Default to using bash -l: -SHELL ["/bin/bash", "-l", "-c"] - -RUN juliaup add 1.8 && juliaup default 1.8 -RUN pyenv install 3.9.2 && pyenv global 3.9.2 -RUN python3 -m pip install --upgrade pip - -# Get PySR source: -WORKDIR /pysr -ADD ./requirements.txt /pysr/requirements.txt -RUN python3 -m pip install -r /pysr/requirements.txt - -ADD ./setup.py /pysr/setup.py -ADD ./pysr/ /pysr/pysr/ - -# First install of PySR: -RUN python3 -m pip install . -RUN python3 -m pysr install - -# Change Python version: -RUN pyenv install 3.10 && pyenv global 3.10 && pyenv uninstall -f 3.9.2 -RUN python3 -m pip install --upgrade pip - -# Second install of PySR: -RUN python3 -m pip install . 
-RUN rm -r ~/.julia/environments/pysr-* -RUN python3 -m pysr install diff --git a/pysr/test/nb_sanitize.cfg b/pysr/test/nb_sanitize.cfg new file mode 100644 index 0000000000000000000000000000000000000000..caabeb6ab04e781509c5be1daef5a0b56f2ef29e --- /dev/null +++ b/pysr/test/nb_sanitize.cfg @@ -0,0 +1,3 @@ +[pathnames] +regex: /[a-zA-Z0-9_\- .\/]+/pysr/sr\.py +replace: PATH diff --git a/pysr/test/params.py b/pysr/test/params.py new file mode 100644 index 0000000000000000000000000000000000000000..9850c9cdf1b53c415e8d40dc728cd2fe4f97f52a --- /dev/null +++ b/pysr/test/params.py @@ -0,0 +1,8 @@ +import inspect + +from .. import PySRRegressor + +DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters +DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default +DEFAULT_POPULATIONS = DEFAULT_PARAMS["populations"].default +DEFAULT_NCYCLES = DEFAULT_PARAMS["ncycles_per_iteration"].default diff --git a/pysr/test/test.py b/pysr/test/test.py index 120f34d08c503d949bac77171060ab207bc5cb7a..e50d0bd37782b07b49ae181b558e4b73bc6dcb4f 100644 --- a/pysr/test/test.py +++ b/pysr/test/test.py @@ -1,4 +1,3 @@ -import inspect import os import pickle as pkl import tempfile @@ -12,16 +11,18 @@ import pandas as pd import sympy from sklearn.utils.estimator_checks import check_estimator -from .. import PySRRegressor, julia_helpers +from .. 
import PySRRegressor, install, jl from ..export_latex import sympy2latex from ..feature_selection import _handle_feature_selection, run_feature_selection +from ..julia_helpers import init_julia from ..sr import _check_assertions, _process_constraints, idx_model_selection from ..utils import _csv_filename_to_pkl_filename - -DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters -DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default -DEFAULT_POPULATIONS = DEFAULT_PARAMS["populations"].default -DEFAULT_NCYCLES = DEFAULT_PARAMS["ncyclesperiteration"].default +from .params import ( + DEFAULT_NCYCLES, + DEFAULT_NITERATIONS, + DEFAULT_PARAMS, + DEFAULT_POPULATIONS, +) class TestPipeline(unittest.TestCase): @@ -80,7 +81,7 @@ class TestPipeline(unittest.TestCase): multithreading=False, turbo=True, early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 1", - full_objective=""" + loss_function=""" function my_objective(tree::Node{T}, dataset::Dataset{T}, options::Options) where T prediction, flag = eval_tree_array(tree, dataset.X, options) !flag && return T(Inf) @@ -95,22 +96,39 @@ class TestPipeline(unittest.TestCase): self.assertLessEqual(best_loss, 1e-10) self.assertGreaterEqual(best_loss, 0.0) + # Test options stored: + self.assertEqual(model.julia_options_.turbo, True) + + def test_multiline_seval(self): + # The user should be able to run multiple things in a single seval call: + num = jl.seval( + """ + function my_new_objective(x) + x^2 + end + 1.5 + """ + ) + self.assertEqual(num, 1.5) + def test_high_precision_search_custom_loss(self): y = 1.23456789 * self.X[:, 0] model = PySRRegressor( **self.default_test_kwargs, early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 3", - loss="my_loss(prediction, target) = (prediction - target)^2", + elementwise_loss="my_loss(prediction, target) = (prediction - target)^2", precision=64, parsimony=0.01, warm_start=True, ) model.fit(self.X, y) - from pysr.sr 
import Main # We should have that the model state is now a Float64 hof: - Main.test_state = model.raw_julia_state_ - self.assertTrue(Main.eval("typeof(test_state[2]).parameters[1] == Float64")) + test_state = model.raw_julia_state_ + self.assertTrue(jl.typeof(test_state[1]).parameters[1] == jl.Float64) + + # Test options stored: + self.assertEqual(model.julia_options_.turbo, False) def test_multioutput_custom_operator_quiet_custom_complexity(self): y = self.X[:, [0, 1]] ** 2 @@ -199,6 +217,7 @@ class TestPipeline(unittest.TestCase): **self.default_test_kwargs, early_stop_condition="(loss, complexity) -> loss <= 1e-4 && complexity <= 6", ) + model.niterations = DEFAULT_NITERATIONS * 10 model.fit(X, y) test_y = model.predict(X) self.assertTrue(np.issubdtype(test_y.dtype, np.complexfloating)) @@ -224,16 +243,17 @@ class TestPipeline(unittest.TestCase): # Test if repeated fit works: regressor.set_params( niterations=1, - ncyclesperiteration=2, + ncycles_per_iteration=2, warm_start=True, early_stop_condition=None, ) - # Check that the the julia state is saved: - from pysr.sr import Main # We should have that the model state is now a Float32 hof: - Main.test_state = regressor.raw_julia_state_ - self.assertTrue(Main.eval("typeof(test_state[2]).parameters[1] == Float32")) + test_state = regressor.julia_state_ + self.assertTrue( + jl.first(jl.typeof(jl.last(test_state)).parameters) == jl.Float32 + ) + # This should exit almost immediately, and use the old equations regressor.fit(X, y) @@ -548,6 +568,17 @@ class TestMiscellaneous(unittest.TestCase): # The correct value should be set: self.assertEqual(model.fraction_replaced, 0.2) + def test_deprecated_functions(self): + with self.assertWarns(FutureWarning): + install() + + _jl = None + + with self.assertWarns(FutureWarning): + _jl = init_julia() + + self.assertEqual(_jl, jl) + def test_power_law_warning(self): """Ensure that a warning is given for a power law operator.""" with self.assertWarns(UserWarning): @@ -594,23 +625,6 
@@ class TestMiscellaneous(unittest.TestCase): with self.assertRaises(ValueError): model.fit(X, y) - def test_changed_options_warning(self): - """Check that a warning is given if Julia options are changed.""" - if julia_helpers.julia_kwargs_at_initialization is None: - julia_helpers.init_julia(julia_kwargs={"threads": 2, "optimize": 3}) - - cur_init = julia_helpers.julia_kwargs_at_initialization - - threads_to_change = cur_init["threads"] + 1 - with warnings.catch_warnings(): - warnings.simplefilter("error") - with self.assertRaises(Exception) as context: - julia_helpers.init_julia( - julia_kwargs={"threads": threads_to_change, "optimize": 3} - ) - self.assertIn("Julia has already started", str(context.exception)) - self.assertIn("threads", str(context.exception)) - def test_extra_sympy_mappings_undefined(self): """extra_sympy_mappings=None errors for custom operators""" model = PySRRegressor(unary_operators=["square2(x) = x^2"]) @@ -640,6 +654,50 @@ class TestMiscellaneous(unittest.TestCase): model.fit(X, y, variable_names=["f{c}"]) self.assertIn("Invalid variable name", str(cm.exception)) + def test_bad_kwargs(self): + bad_kwargs = [ + dict( + kwargs=dict( + elementwise_loss="g(x, y) = 0.0", loss_function="f(*args) = 0.0" + ), + error=ValueError, + ), + dict( + kwargs=dict(maxsize=3), + error=ValueError, + ), + dict( + kwargs=dict(tournament_selection_n=10, population_size=3), + error=ValueError, + ), + dict( + kwargs=dict(optimizer_algorithm="COBYLA"), + error=NotImplementedError, + ), + dict( + kwargs=dict( + constraints={ + "+": (3, 5), + } + ), + error=NotImplementedError, + ), + dict( + kwargs=dict(binary_operators=["α(x, y) = x - y"]), + error=ValueError, + ), + dict( + kwargs=dict(model_selection="unknown"), + error=NotImplementedError, + ), + ] + for opt in bad_kwargs: + model = PySRRegressor(**opt["kwargs"], niterations=1) + with self.assertRaises(opt["error"]): + model.fit([[1]], [1]) + model.get_best() + print("Failed", opt["kwargs"]) + def 
test_pickle_with_temp_equation_file(self): """If we have a temporary equation file, unpickle the estimator.""" model = PySRRegressor( @@ -678,7 +736,7 @@ class TestMiscellaneous(unittest.TestCase): model = PySRRegressor( niterations=int(1 + DEFAULT_NITERATIONS / 10), populations=int(1 + DEFAULT_POPULATIONS / 3), - ncyclesperiteration=int(2 + DEFAULT_NCYCLES / 10), + ncycles_per_iteration=int(2 + DEFAULT_NCYCLES / 10), verbosity=0, progress=False, random_state=0, @@ -715,6 +773,9 @@ class TestMiscellaneous(unittest.TestCase): def test_param_groupings(self): """Test that param_groupings are complete""" param_groupings_file = Path(__file__).parent.parent / "param_groupings.yml" + if not param_groupings_file.exists(): + return + # Read the file, discarding lines ending in ":", # and removing leading "\s*-\s*": params = [] @@ -1072,10 +1133,8 @@ class TestDimensionalConstraints(unittest.TestCase): # TODO: Determine desired behavior if second .fit() call does not have units -def runtests(): +def runtests(just_tests=False): """Run all tests in test.py.""" - suite = unittest.TestSuite() - loader = unittest.TestLoader() test_cases = [ TestPipeline, TestBest, @@ -1084,8 +1143,11 @@ def runtests(): TestLaTeXTable, TestDimensionalConstraints, ] + if just_tests: + return test_cases + suite = unittest.TestSuite() + loader = unittest.TestLoader() for test_case in test_cases: - tests = loader.loadTestsFromTestCase(test_case) - suite.addTests(tests) + suite.addTests(loader.loadTestsFromTestCase(test_case)) runner = unittest.TextTestRunner() return runner.run(suite) diff --git a/pysr/test/test_cli.py b/pysr/test/test_cli.py index 0a97a1eb17d0bf7b4e097bff7ac417c4e60034f8..08fbfa6cb02638067529f1a565a3ad35ecb61ab4 100644 --- a/pysr/test/test_cli.py +++ b/pysr/test/test_cli.py @@ -1,59 +1,83 @@ import unittest +from textwrap import dedent from click import testing as click_testing -from .._cli.main import pysr - - -class TestCli(unittest.TestCase): - # TODO: Include test for custom 
project here. - def setUp(self): - self.cli_runner = click_testing.CliRunner() - - def test_help_on_all_commands(self): - expected = "\n".join( - [ - "Usage: pysr [OPTIONS] COMMAND [ARGS]...", - "", - "Options:", - " --help Show this message and exit.", - "", - "Commands:", - " install Install Julia dependencies for PySR.", - "", - ] - ) - result = self.cli_runner.invoke(pysr, ["--help"]) - self.assertEqual(expected, result.output) - self.assertEqual(0, result.exit_code) - - def test_help_on_install(self): - expected = "\n".join( - [ - "Usage: pysr install [OPTIONS]", - "", - " Install Julia dependencies for PySR.", - "", - "Options:", - " -p, --project PROJECT_DIRECTORY", - " Install in a specific Julia project (e.g., a", - " local copy of SymbolicRegression.jl).", - " -q, --quiet Disable logging.", - " --precompile Force precompilation of Julia libraries.", - " --no-precompile Disable precompilation.", - " --help Show this message and exit.", - "", - ] - ) - result = self.cli_runner.invoke(pysr, ["install", "--help"]) - self.assertEqual(expected, result.output) - self.assertEqual(0, result.exit_code) - - -def runtests(): - """Run all tests in cliTest.py.""" - loader = unittest.TestLoader() - suite = unittest.TestSuite() - suite.addTests(loader.loadTestsFromTestCase(TestCli)) - runner = unittest.TextTestRunner() - return runner.run(suite) + +def get_runtests(): + # Lazy load to avoid circular imports. + + from .._cli.main import pysr + + class TestCli(unittest.TestCase): + # TODO: Include test for custom project here. + def setUp(self): + self.cli_runner = click_testing.CliRunner() + + def test_help_on_all_commands(self): + expected = dedent( + """ + Usage: pysr [OPTIONS] COMMAND [ARGS]... + + Options: + --help Show this message and exit. + + Commands: + install DEPRECATED (dependencies are now installed at import). + test Run parts of the PySR test suite. 
+ """ + ) + result = self.cli_runner.invoke(pysr, ["--help"]) + self.assertEqual(result.output.strip(), expected.strip()) + self.assertEqual(result.exit_code, 0) + + def test_help_on_install(self): + expected = dedent( + """ + Usage: pysr install [OPTIONS] + + DEPRECATED (dependencies are now installed at import). + + Options: + -p, --project TEXT + -q, --quiet Disable logging. + --precompile + --no-precompile + --help Show this message and exit. + """ + ) + result = self.cli_runner.invoke(pysr, ["install", "--help"]) + self.assertEqual(result.output.strip(), expected.strip()) + self.assertEqual(result.exit_code, 0) + + def test_help_on_test(self): + expected = dedent( + """ + Usage: pysr test [OPTIONS] TESTS + + Run parts of the PySR test suite. + + Choose from main, jax, torch, cli, dev, and startup. You can give multiple + tests, separated by commas. + + Options: + --help Show this message and exit. + """ + ) + result = self.cli_runner.invoke(pysr, ["test", "--help"]) + self.assertEqual(result.output.strip(), expected.strip()) + self.assertEqual(result.exit_code, 0) + + def runtests(just_tests=False): + """Run all tests in cliTest.py.""" + tests = [TestCli] + if just_tests: + return tests + loader = unittest.TestLoader() + suite = unittest.TestSuite() + for test in tests: + suite.addTests(loader.loadTestsFromTestCase(test)) + runner = unittest.TextTestRunner() + return runner.run(suite) + + return runtests diff --git a/pysr/test/test_dev.py b/pysr/test/test_dev.py new file mode 100644 index 0000000000000000000000000000000000000000..b8a2b4645ebd86f03623cb055736c7f04057dfe7 --- /dev/null +++ b/pysr/test/test_dev.py @@ -0,0 +1,59 @@ +import os +import subprocess +import unittest +from pathlib import Path + + +class TestDev(unittest.TestCase): + def test_simple_change_to_backend(self): + """Test that we can use a development version of SymbolicRegression.jl""" + PYSR_TEST_JULIA_VERSION = os.environ.get("PYSR_TEST_JULIA_VERSION", "1.6") + PYSR_TEST_PYTHON_VERSION = 
os.environ.get("PYSR_TEST_PYTHON_VERSION", "3.9") + build_result = subprocess.run( + [ + "docker", + "build", + "-t", + "pysr-dev", + "--build-arg", + f"JLVERSION={PYSR_TEST_JULIA_VERSION}", + "--build-arg", + f"PYVERSION={PYSR_TEST_PYTHON_VERSION}", + "-f", + "pysr/test/test_dev_pysr.dockerfile", + ".", + ], + env=os.environ, + cwd=Path(__file__).parent.parent.parent, + universal_newlines=True, + ) + self.assertEqual(build_result.returncode, 0) + test_result = subprocess.run( + [ + "docker", + "run", + "--rm", + "pysr-dev", + "python3", + "-c", + "from pysr import SymbolicRegression as SR; print(SR.__test_function())", + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=os.environ, + cwd=Path(__file__).parent.parent.parent, + ) + self.assertEqual(test_result.returncode, 0) + self.assertEqual(test_result.stdout.decode("utf-8").strip(), "2.3") + + +def runtests(just_tests=False): + tests = [TestDev] + if just_tests: + return tests + suite = unittest.TestSuite() + loader = unittest.TestLoader() + for test in tests: + suite.addTests(loader.loadTestsFromTestCase(test)) + runner = unittest.TextTestRunner() + return runner.run(suite) diff --git a/pysr/test/test_dev_pysr.dockerfile b/pysr/test/test_dev_pysr.dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..b49df455ee0d0b3b8e947521d9e49810ff7ac0f8 --- /dev/null +++ b/pysr/test/test_dev_pysr.dockerfile @@ -0,0 +1,57 @@ +# This dockerfile simulates a user installation that +# tries to manually edit SymbolicRegression.jl and +# use it from PySR. 
+ +ARG JLVERSION=1.9.4 +ARG PYVERSION=3.11.6 +ARG BASE_IMAGE=bullseye + +FROM julia:${JLVERSION}-${BASE_IMAGE} AS jl +FROM python:${PYVERSION}-${BASE_IMAGE} + +# Merge Julia image: +COPY --from=jl /usr/local/julia /usr/local/julia +ENV PATH="/usr/local/julia/bin:${PATH}" + +WORKDIR /pysr + +# Caches install (https://stackoverflow.com/questions/25305788/how-to-avoid-reinstalling-packages-when-building-docker-image-for-python-project) +ADD ./requirements.txt /pysr/requirements.txt +RUN pip3 install --no-cache-dir -r /pysr/requirements.txt + +# Install PySR: +# We do a minimal copy so it doesn't need to rerun at every file change: +ADD ./pyproject.toml /pysr/pyproject.toml +ADD ./setup.py /pysr/setup.py + +RUN mkdir /pysr/pysr +ADD ./pysr/*.py /pysr/pysr/ +ADD ./pysr/juliapkg.json /pysr/pysr/juliapkg.json + +RUN mkdir /pysr/pysr/_cli +ADD ./pysr/_cli/*.py /pysr/pysr/_cli/ + +RUN mkdir /pysr/pysr/test + +RUN pip3 install --no-cache-dir . + +# Now, we create a custom version of SymbolicRegression.jl +# First, we get the version from juliapkg.json: +RUN python3 -c 'import json; print(json.load(open("/pysr/pysr/juliapkg.json", "r"))["packages"]["SymbolicRegression"]["version"])' > /pysr/sr_version + +# Remove any = or ^ or ~ from the version: +RUN cat /pysr/sr_version | sed 's/[\^=~]//g' > /pysr/sr_version_processed + +# Now, we check out the version of SymbolicRegression.jl that PySR is using: +RUN git clone -b "v$(cat /pysr/sr_version_processed)" --single-branch https://github.com/MilesCranmer/SymbolicRegression.jl /srjl + +# Edit SymbolicRegression.jl to create a new function. 
+# We want to put this function immediately after `module SymbolicRegression`: +RUN sed -i 's/module SymbolicRegression/module SymbolicRegression\n__test_function() = 2.3/' /srjl/src/SymbolicRegression.jl + +# Edit PySR to use the custom version of SymbolicRegression.jl: +ADD ./pysr/test/generate_dev_juliapkg.py /generate_dev_juliapkg.py +RUN python3 /generate_dev_juliapkg.py /pysr/pysr/juliapkg.json /srjl + +# Precompile +RUN python3 -c 'import pysr' diff --git a/pysr/test/test_env.py b/pysr/test/test_env.py deleted file mode 100644 index 423a3064ad792cacba5e66effb85546850003dc0..0000000000000000000000000000000000000000 --- a/pysr/test/test_env.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Contains tests for creating and initializing custom Julia projects.""" - -import os -import unittest -from tempfile import TemporaryDirectory - -from .. import julia_helpers - - -class TestJuliaProject(unittest.TestCase): - """Various tests for working with Julia projects.""" - - def test_custom_shared_env(self): - """Test that we can use PySR in a custom shared env.""" - with TemporaryDirectory() as tmpdir: - # Create a temp depot to store julia packages (and our custom env) - Main = julia_helpers.init_julia() - - # Set up env: - if "JULIA_DEPOT_PATH" not in os.environ: - old_env = None - os.environ["JULIA_DEPOT_PATH"] = tmpdir - else: - old_env = os.environ["JULIA_DEPOT_PATH"] - os.environ[ - "JULIA_DEPOT_PATH" - ] = f"{tmpdir}:{os.environ['JULIA_DEPOT_PATH']}" - Main.eval( - f'pushfirst!(DEPOT_PATH, "{julia_helpers._escape_filename(tmpdir)}")' - ) - test_env_name = "@pysr_test_env" - julia_helpers.install(julia_project=test_env_name) - Main = julia_helpers.init_julia(julia_project=test_env_name) - - # Try to use env: - Main.eval("using SymbolicRegression") - Main.eval("using Pkg") - - # Assert we actually loaded it: - cur_project_dir = Main.eval("splitdir(dirname(Base.active_project()))[1]") - potential_shared_project_dirs = Main.eval("Pkg.envdir(DEPOT_PATH[1])") - 
self.assertEqual(cur_project_dir, potential_shared_project_dirs) - - # Clean up: - Main.eval("pop!(DEPOT_PATH)") - if old_env is None: - del os.environ["JULIA_DEPOT_PATH"] - else: - os.environ["JULIA_DEPOT_PATH"] = old_env - - -def runtests(): - """Run all tests in test_env.py.""" - loader = unittest.TestLoader() - suite = unittest.TestSuite() - suite.addTests(loader.loadTestsFromTestCase(TestJuliaProject)) - runner = unittest.TextTestRunner() - return runner.run(suite) diff --git a/pysr/test/test_jax.py b/pysr/test/test_jax.py index aaafb97cf6b3be04987c56df7f2c87c83baf3ae6..5e4e5ef18b6d995f15ae97a66617c486384f75a6 100644 --- a/pysr/test/test_jax.py +++ b/pysr/test/test_jax.py @@ -121,10 +121,14 @@ class TestJAX(unittest.TestCase): np.testing.assert_almost_equal(y.values, jax_output, decimal=3) -def runtests(): +def runtests(just_tests=False): """Run all tests in test_jax.py.""" + tests = [TestJAX] + if just_tests: + return tests loader = unittest.TestLoader() suite = unittest.TestSuite() - suite.addTests(loader.loadTestsFromTestCase(TestJAX)) + for test in tests: + suite.addTests(loader.loadTestsFromTestCase(test)) runner = unittest.TextTestRunner() return runner.run(suite) diff --git a/pysr/test/test_nb.ipynb b/pysr/test/test_nb.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..d4a059a8e149a6786188c0d48a3bfca684b8cbaa --- /dev/null +++ b/pysr/test/test_nb.ipynb @@ -0,0 +1,536 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "import numpy as np\n", + "from pysr import PySRRegressor, jl" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3\n" + ] + } + ], + "source": [ + "%%julia\n", + "\n", + "# Automatically activates Julia magic\n", + "\n", + "x = 1\n", + "println(x + 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4\n" + ] + } + ], + "source": [ + "%julia println(x + 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
PySRRegressor.equations_ = None
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "PySRRegressor.equations_ = None" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rstate = np.random.RandomState(0)\n", + "X = np.random.randn(10, 2)\n", + "y = np.random.randn(10)\n", + "\n", + "model = PySRRegressor(deterministic=True, multithreading=False, procs=0, random_state=0, verbosity=0, progress=False)\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mcranmer/PermaDocuments/SymbolicRegressionMonorepo/.venv/lib/python3.12/site-packages/pysr/sr.py:1297: UserWarning: Note: it looks like you are running in Jupyter. The progress bar will be turned off.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "pandas.core.frame.DataFrame" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.fit(X, y)\n", + "type(model.equations_)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pysr/test/test_startup.py b/pysr/test/test_startup.py new file mode 100644 index 0000000000000000000000000000000000000000..8823ccf01d03e7630e26841056240dabc5724d2d --- /dev/null +++ b/pysr/test/test_startup.py @@ -0,0 +1,164 @@ +import os +import platform +import subprocess +import sys +import tempfile +import textwrap +import unittest +from pathlib import Path + +import numpy as np + +from .. 
import PySRRegressor +from ..julia_import import jl_version +from .params import DEFAULT_NITERATIONS, DEFAULT_POPULATIONS + + +class TestStartup(unittest.TestCase): + """Various tests related to starting up PySR.""" + + def setUp(self): + # Using inspect, + # get default niterations from PySRRegressor, and double them: + self.default_test_kwargs = dict( + progress=False, + model_selection="accuracy", + niterations=DEFAULT_NITERATIONS * 2, + populations=DEFAULT_POPULATIONS * 2, + temp_equation_file=True, + ) + self.rstate = np.random.RandomState(0) + self.X = self.rstate.randn(100, 5) + + def test_warm_start_from_file(self): + """Test that we can warm start in another process.""" + if platform.system() == "Windows": + self.skipTest("Warm start test incompatible with Windows") + + with tempfile.TemporaryDirectory() as tmpdirname: + model = PySRRegressor( + **self.default_test_kwargs, + unary_operators=["cos"], + ) + model.warm_start = True + model.temp_equation_file = False + model.equation_file = Path(tmpdirname) / "equations.csv" + model.deterministic = True + model.multithreading = False + model.random_state = 0 + model.procs = 0 + model.early_stop_condition = 1e-10 + + rstate = np.random.RandomState(0) + X = rstate.randn(100, 2) + y = np.cos(X[:, 0]) ** 2 + model.fit(X, y) + + best_loss = model.equations_.iloc[-1]["loss"] + + # Save X and y to a file: + X_file = Path(tmpdirname) / "X.npy" + y_file = Path(tmpdirname) / "y.npy" + np.save(X_file, X) + np.save(y_file, y) + # Now, create a new process and warm start from the file: + result = subprocess.run( + [ + sys.executable, + "-c", + textwrap.dedent( + f""" + from pysr import PySRRegressor + import numpy as np + + X = np.load("{X_file}") + y = np.load("{y_file}") + + print("Loading model from file") + model = PySRRegressor.from_file("{model.equation_file}") + + assert model.julia_state_ is not None + + # Reset saved equations; should be loaded from state! 
+ model.equations_ = None + model.equation_file_contents_ = None + + model.warm_start = True + model.niterations = 0 + model.max_evals = 0 + model.ncycles_per_iteration = 0 + + model.fit(X, y) + + best_loss = model.equations_.iloc[-1]["loss"] + + assert best_loss <= {best_loss} + """ + ), + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=os.environ, + ) + self.assertEqual(result.returncode, 0) + self.assertIn("Loading model from file", result.stdout.decode()) + self.assertIn("Started!", result.stderr.decode()) + + def test_bad_startup_options(self): + warning_tests = [ + dict( + code='import os; os.environ["PYTHON_JULIACALL_HANDLE_SIGNALS"] = "no"; import pysr', + msg="PYTHON_JULIACALL_HANDLE_SIGNALS environment variable is set", + ), + dict( + code='import os; os.environ["PYTHON_JULIACALL_THREADS"] = "1"; import pysr', + msg="PYTHON_JULIACALL_THREADS environment variable is set", + ), + dict( + code="import juliacall; import pysr", + msg="juliacall module already imported.", + ), + dict( + code='import os; os.environ["PYSR_AUTOLOAD_EXTENSIONS"] = "foo"; import pysr', + msg="PYSR_AUTOLOAD_EXTENSIONS environment variable is set", + ), + ] + for warning_test in warning_tests: + result = subprocess.run( + [sys.executable, "-c", warning_test["code"]], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=os.environ, + ) + self.assertIn(warning_test["msg"], result.stderr.decode()) + + def test_notebook(self): + if jl_version < (1, 9, 0): + self.skipTest("Julia version too old") + if platform.system() == "Windows": + self.skipTest("Notebook test incompatible with Windows") + result = subprocess.run( + [ + sys.executable, + "-m", + "pytest", + "--nbval", + str(Path(__file__).parent / "test_nb.ipynb"), + "--nbval-sanitize-with", + str(Path(__file__).parent / "nb_sanitize.cfg"), + ], + env=os.environ, + ) + self.assertEqual(result.returncode, 0) + + +def runtests(just_tests=False): + tests = [TestStartup] + if just_tests: + return tests + suite = 
unittest.TestSuite() + loader = unittest.TestLoader() + for test in tests: + suite.addTests(loader.loadTestsFromTestCase(test)) + runner = unittest.TextTestRunner() + return runner.run(suite) diff --git a/pysr/test/test_torch.py b/pysr/test/test_torch.py index 5a71af71ff55decc488a86fb47cbe99bb90a231d..35055c6a08b4502f55f286711fd9b650d3eb07c7 100644 --- a/pysr/test/test_torch.py +++ b/pysr/test/test_torch.py @@ -1,4 +1,3 @@ -import platform import unittest import numpy as np @@ -7,42 +6,28 @@ import sympy from .. import PySRRegressor, sympy2torch -# Need to initialize Julia before importing torch... - - -def _import_torch(): - if platform.system() == "Darwin": - # Import PyJulia, then Torch - from ..julia_helpers import init_julia - - init_julia() - - import torch - else: - # Import Torch, then PyJulia - # https://github.com/pytorch/pytorch/issues/78829 - import torch - return torch - class TestTorch(unittest.TestCase): def setUp(self): np.random.seed(0) + # Need to import after juliacall: + import torch + + self.torch = torch + def test_sympy2torch(self): - torch = _import_torch() x, y, z = sympy.symbols("x y z") cosx = 1.0 * sympy.cos(x) + y - X = torch.tensor(np.random.randn(1000, 3)) - true = 1.0 * torch.cos(X[:, 0]) + X[:, 1] + X = self.torch.tensor(np.random.randn(1000, 3)) + true = 1.0 * self.torch.cos(X[:, 0]) + X[:, 1] torch_module = sympy2torch(cosx, [x, y, z]) self.assertTrue( np.all(np.isclose(torch_module(X).detach().numpy(), true.detach().numpy())) ) def test_pipeline_pandas(self): - torch = _import_torch() X = pd.DataFrame(np.random.randn(100, 10)) y = np.ones(X.shape[0]) model = PySRRegressor( @@ -71,13 +56,12 @@ class TestTorch(unittest.TestCase): self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)") np.testing.assert_almost_equal( - tformat(torch.tensor(X.values)).detach().numpy(), + tformat(self.torch.tensor(X.values)).detach().numpy(), np.square(np.cos(X.values[:, 1])), # Selection 1st feature decimal=3, ) def 
test_pipeline(self): - torch = _import_torch() X = np.random.randn(100, 10) y = np.ones(X.shape[0]) model = PySRRegressor( @@ -106,22 +90,22 @@ class TestTorch(unittest.TestCase): self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)") np.testing.assert_almost_equal( - tformat(torch.tensor(X)).detach().numpy(), + tformat(self.torch.tensor(X)).detach().numpy(), np.square(np.cos(X[:, 1])), # 2nd feature decimal=3, ) def test_mod_mapping(self): - torch = _import_torch() x, y, z = sympy.symbols("x y z") expression = x**2 + sympy.atanh(sympy.Mod(y + 1, 2) - 1) * 3.2 * z module = sympy2torch(expression, [x, y, z]) - X = torch.rand(100, 3).float() * 10 + X = self.torch.rand(100, 3).float() * 10 true_out = ( - X[:, 0] ** 2 + torch.atanh(torch.fmod(X[:, 1] + 1, 2) - 1) * 3.2 * X[:, 2] + X[:, 0] ** 2 + + self.torch.atanh(self.torch.fmod(X[:, 1] + 1, 2) - 1) * 3.2 * X[:, 2] ) torch_out = module(X) @@ -130,7 +114,6 @@ class TestTorch(unittest.TestCase): ) def test_custom_operator(self): - torch = _import_torch() X = np.random.randn(100, 3) y = np.ones(X.shape[0]) model = PySRRegressor( @@ -156,7 +139,7 @@ class TestTorch(unittest.TestCase): model.set_params( equation_file="equation_file_custom_operator.csv", extra_sympy_mappings={"mycustomoperator": sympy.sin}, - extra_torch_mappings={"mycustomoperator": torch.sin}, + extra_torch_mappings={"mycustomoperator": self.torch.sin}, ) model.refresh(checkpoint_file="equation_file_custom_operator.csv") self.assertEqual(str(model.sympy()), "sin(x1)") @@ -165,13 +148,12 @@ class TestTorch(unittest.TestCase): tformat = model.pytorch() self.assertEqual(str(tformat), "_SingleSymPyModule(expression=sin(x1))") np.testing.assert_almost_equal( - tformat(torch.tensor(X)).detach().numpy(), + tformat(self.torch.tensor(X)).detach().numpy(), np.sin(X[:, 1]), decimal=3, ) def test_feature_selection_custom_operators(self): - torch = _import_torch() rstate = np.random.RandomState(0) X = pd.DataFrame({f"k{i}": rstate.randn(2000) for 
i in range(10, 21)}) cos_approx = lambda x: 1 - (x**2) / 2 + (x**4) / 24 + (x**6) / 720 @@ -196,16 +178,20 @@ class TestTorch(unittest.TestCase): np_output = model.predict(X.values) - torch_output = torch_module(torch.tensor(X.values)).detach().numpy() + torch_output = torch_module(self.torch.tensor(X.values)).detach().numpy() np.testing.assert_almost_equal(y.values, np_output, decimal=3) np.testing.assert_almost_equal(y.values, torch_output, decimal=3) -def runtests(): +def runtests(just_tests=False): """Run all tests in test_torch.py.""" + tests = [TestTorch] + if just_tests: + return tests loader = unittest.TestLoader() suite = unittest.TestSuite() - suite.addTests(loader.loadTestsFromTestCase(TestTorch)) + for test in tests: + suite.addTests(loader.loadTestsFromTestCase(test)) runner = unittest.TextTestRunner() return runner.run(suite) diff --git a/pysr/version.py b/pysr/version.py deleted file mode 100644 index d88a664ece725aa437b06214244d36c03fd0a34f..0000000000000000000000000000000000000000 --- a/pysr/version.py +++ /dev/null @@ -1,2 +0,0 @@ -__version__ = "0.16.9" -__symbolic_regression_jl_version__ = "0.23.1" diff --git a/requirements.txt b/requirements.txt index ffb3e0d63d3ac89b83087a57ae370abdeef5a200..86e93ecde465a19d76d9404e9a84501db52ad4a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ sympy>=1.0.0,<2.0.0 pandas>=0.21.0,<3.0.0 numpy>=1.13.0,<2.0.0 scikit_learn>=1.0.0,<2.0.0 -julia>=0.6.0,<0.7.0 +juliacall==0.9.15 click>=7.0.0,<9.0.0 setuptools>=50.0.0 typing_extensions>=4.0.0,<5.0.0; python_version < "3.8" diff --git a/setup.py b/setup.py index d9f026f461460b720f336e3189a4cc98ce9a5ee2..2cf7ba8eb9f485d13d299f78d37e41e187c06231 100644 --- a/setup.py +++ b/setup.py @@ -1,30 +1,32 @@ -import setuptools +import os -try: - with open("README.md", "r", encoding="utf8") as fh: - long_description = fh.read() -except FileNotFoundError: - long_description = "" +from setuptools import setup -exec(open("pysr/version.py").read()) +if 
os.path.exists(".git"): + kwargs = { + "use_scm_version": { + "write_to": "pysr/version.py", + }, + "setup_requires": ["setuptools", "setuptools_scm"], + } +else: + # Read from pyproject.toml directly + import re -setuptools.setup( - name="pysr", - version=__version__, - author="Miles Cranmer", - author_email="miles.cranmer@gmail.com", - description="Simple and efficient symbolic regression", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/MilesCranmer/pysr", - # Read from requirements.txt: - install_requires=open("requirements.txt").read().splitlines(), - packages=setuptools.find_packages(), - package_data={"pysr": ["../Project.toml", "../datasets/*"]}, - include_package_data=False, - classifiers=[ - "Programming Language :: Python :: 3", - "Operating System :: OS Independent", - ], - python_requires=">=3.7", -) + with open(os.path.join(os.path.dirname(__file__), "pyproject.toml")) as f: + data = f.read() + # Find the version + version = re.search(r'version = "(.*)"', data).group(1) + + # Write the version to version.py + with open(os.path.join(os.path.dirname(__file__), "pysr", "version.py"), "w") as f: + f.write(f'__version__ = "{version}"') + + kwargs = { + "use_scm_version": False, + "version": version, + } + + +# Build options are managed in pyproject.toml +setup(**kwargs)