tree3po committed (verified)
Commit e0f25ed · Parent(s): d64c84d

Upload 190 files

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50)
  1. .gitattributes +8 -0
  2. Kinetix/.gitignore +194 -0
  3. Kinetix/.pre-commit-config.yaml +7 -0
  4. Kinetix/LICENSE +19 -0
  5. Kinetix/README.md +217 -0
  6. Kinetix/configs/editor.yaml +22 -0
  7. Kinetix/configs/env/entity.yaml +3 -0
  8. Kinetix/configs/env/symbolic.yaml +3 -0
  9. Kinetix/configs/env_size/custom.yaml +3 -0
  10. Kinetix/configs/env_size/l.yaml +8 -0
  11. Kinetix/configs/env_size/m.yaml +8 -0
  12. Kinetix/configs/env_size/s.yaml +8 -0
  13. Kinetix/configs/eval/eval_all.yaml +82 -0
  14. Kinetix/configs/eval/eval_auto.yaml +4 -0
  15. Kinetix/configs/eval/eval_general.yaml +7 -0
  16. Kinetix/configs/eval/l.yaml +46 -0
  17. Kinetix/configs/eval/m.yaml +30 -0
  18. Kinetix/configs/eval/mujoco.yaml +13 -0
  19. Kinetix/configs/eval/s.yaml +16 -0
  20. Kinetix/configs/eval_env_size/l.yaml +7 -0
  21. Kinetix/configs/eval_env_size/m.yaml +7 -0
  22. Kinetix/configs/eval_env_size/s.yaml +7 -0
  23. Kinetix/configs/learning/ppo-base.yaml +20 -0
  24. Kinetix/configs/learning/ppo-rnn.yaml +2 -0
  25. Kinetix/configs/learning/ppo-sfl.yaml +1 -0
  26. Kinetix/configs/learning/ppo-ued.yaml +2 -0
  27. Kinetix/configs/misc/misc.yaml +16 -0
  28. Kinetix/configs/model/model-base.yaml +4 -0
  29. Kinetix/configs/model/model-transformer.yaml +6 -0
  30. Kinetix/configs/plr.yaml +17 -0
  31. Kinetix/configs/ppo.yaml +20 -0
  32. Kinetix/configs/sfl.yaml +21 -0
  33. Kinetix/configs/train_levels/l.yaml +44 -0
  34. Kinetix/configs/train_levels/m.yaml +28 -0
  35. Kinetix/configs/train_levels/mujoco.yaml +11 -0
  36. Kinetix/configs/train_levels/random.yaml +2 -0
  37. Kinetix/configs/train_levels/s.yaml +14 -0
  38. Kinetix/configs/train_levels/train_all.yaml +80 -0
  39. Kinetix/configs/ued/accel.yaml +16 -0
  40. Kinetix/configs/ued/plr.yaml +17 -0
  41. Kinetix/configs/ued/sfl.yaml +9 -0
  42. Kinetix/docs/README.md +83 -0
  43. Kinetix/docs/configs.md +179 -0
  44. Kinetix/examples/example_premade_level_replay.py +46 -0
  45. Kinetix/examples/example_random_level_replay.py +51 -0
  46. Kinetix/experiments/plr.py +1143 -0
  47. Kinetix/experiments/ppo.py +468 -0
  48. Kinetix/experiments/sfl.py +1067 -0
  49. Kinetix/images/bb.gif +0 -0
  50. Kinetix/images/cartpole.gif +0 -0
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Kinetix/images/general_2.gif filter=lfs diff=lfs merge=lfs -text
37
+ Kinetix/images/kinetix_logo.gif filter=lfs diff=lfs merge=lfs -text
38
+ Kinetix/images/random_1.gif filter=lfs diff=lfs merge=lfs -text
39
+ Kinetix/images/random_3.gif filter=lfs diff=lfs merge=lfs -text
40
+ Kinetix/images/random_4.gif filter=lfs diff=lfs merge=lfs -text
41
+ Kinetix/images/random_5.gif filter=lfs diff=lfs merge=lfs -text
42
+ Kinetix/images/random_6.gif filter=lfs diff=lfs merge=lfs -text
43
+ Kinetix/images/random_7.gif filter=lfs diff=lfs merge=lfs -text
Kinetix/.gitignore ADDED
@@ -0,0 +1,194 @@
1
+ tmp/
2
+ wandb/
3
+ runs/
4
+
5
+ play_data
6
+ checkpoints
7
+
8
+ # Byte-compiled / optimized / DLL files
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+
13
+ # C extensions
14
+ *.so
15
+
16
+ # Distribution / packaging
17
+ .Python
18
+ build/
19
+ develop-eggs/
20
+ dist/
21
+ downloads/
22
+ eggs/
23
+ .eggs/
24
+ lib/
25
+ lib64/
26
+ parts/
27
+ sdist/
28
+ var/
29
+ wheels/
30
+ share/python-wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+ # Installer logs
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # Unit test / coverage reports
47
+ htmlcov/
48
+ .tox/
49
+ .nox/
50
+ .coverage
51
+ .coverage.*
52
+ .cache*
53
+ .cache_*
54
+ nosetests.xml
55
+ coverage.xml
56
+ *.cover
57
+ *.py,cover
58
+ .hypothesis/
59
+ .pytest_cache/
60
+ cover/
61
+
62
+ # Translations
63
+ *.mo
64
+ *.pot
65
+
66
+ # Django stuff:
67
+ *.log
68
+ local_settings.py
69
+ db.sqlite3
70
+ db.sqlite3-journal
71
+
72
+ # Flask stuff:
73
+ instance/
74
+ .webassets-cache
75
+
76
+ # Scrapy stuff:
77
+ .scrapy
78
+
79
+ # Sphinx documentation
80
+ docs/_build/
81
+
82
+ # PyBuilder
83
+ .pybuilder/
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # IPython
90
+ profile_default/
91
+ ipython_config.py
92
+
93
+ # pyenv
94
+ # For a library or package, you might want to ignore these files since the code is
95
+ # intended to run in multiple environments; otherwise, check them in:
96
+ # .python-version
97
+
98
+ # pipenv
99
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
101
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
102
+ # install all needed dependencies.
103
+ #Pipfile.lock
104
+
105
+ # poetry
106
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
107
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
108
+ # commonly ignored for libraries.
109
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
110
+ #poetry.lock
111
+
112
+ # pdm
113
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
114
+ #pdm.lock
115
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
116
+ # in version control.
117
+ # https://pdm.fming.dev/#use-with-ide
118
+ .pdm.toml
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+
140
+ !configs/env
141
+
142
+ # Spyder project settings
143
+ .spyderproject
144
+ .spyproject
145
+
146
+ # Rope project settings
147
+ .ropeproject
148
+
149
+ # mkdocs documentation
150
+ /site
151
+
152
+ # mypy
153
+ .mypy_cache/
154
+ .dmypy.json
155
+ dmypy.json
156
+
157
+ # Pyre type checker
158
+ .pyre/
159
+
160
+ # pytype static type analyzer
161
+ .pytype/
162
+
163
+ # Cython debug symbols
164
+ cython_debug/
165
+
166
+ # PyCharm
167
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
170
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171
+ .idea/
172
+ texture_cache.pbz2
173
+ texture_cache*.pbz2
174
+ profile*
175
+ wandb_key
176
+ test.py
177
+ outputs
178
+ lol*
179
+ .cache-location
180
+ experiments/ppo_old.py
181
+ .bash_history
182
+ logs/
183
+ .vscode
184
+
185
+ kinetix/util/old_learning_with_mask.py
186
+ offline/datasets/*.pkl
187
+ all_sweeps
188
+ worlds/games
189
+
190
+ artifacts
191
+ log*_*
192
+ kinetix/analysis/test*.py
193
+ slurm-*.out
194
+ results/
Kinetix/.pre-commit-config.yaml ADDED
@@ -0,0 +1,7 @@
1
+ repos:
2
+ - repo: https://github.com/psf/black
3
+ rev: 22.3.0
4
+ hooks:
5
+ - id: black
6
+ language_version: python3
7
+ args: [--line-length=120]
Kinetix/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2024 Michael Matthews
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
Kinetix/README.md ADDED
@@ -0,0 +1,217 @@
1
+ <p align="middle">
2
+ <img src="images/kinetix_logo.gif" width="500" />
3
+ </p>
4
+
5
+ <p align="center">
6
+ <a href= "https://pypi.org/project/jax2d/">
7
+ <img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue" /></a>
8
+ <a href= "https://github.com/FLAIROx/Kinetix/blob/main/LICENSE">
9
+ <img src="https://img.shields.io/badge/License-MIT-yellow" /></a>
10
+ <a href= "https://github.com/psf/black">
11
+ <img src="https://img.shields.io/badge/code%20style-black-000000.svg" /></a>
12
+ <a href= "https://kinetix-env.github.io/">
13
+ <img src="https://img.shields.io/badge/online-editor-purple" /></a>
14
+ <a href= "https://arxiv.org/abs/2410.23208">
15
+ <img src="https://img.shields.io/badge/arxiv-2410.23208-b31b1b" /></a>
16
+ <a href= "./docs/README.md">
17
+ <img src="https://img.shields.io/badge/docs-green" /></a>
18
+ </p>
19
+
20
+ # Kinetix
21
+
22
+ Kinetix is a framework for reinforcement learning in a 2D rigid-body physics world, written entirely in [JAX](https://github.com/jax-ml/jax).
23
+ Kinetix can represent a huge array of physics-based tasks within a unified framework.
24
+ We use Kinetix to investigate the training of large, general reinforcement learning agents by procedurally generating millions of tasks for training.
25
+ You can play with Kinetix in our [online editor](https://kinetix-env.github.io/), or have a look at the JAX [physics engine](https://github.com/MichaelTMatthews/Jax2D) and [graphics library](https://github.com/FLAIROx/JaxGL) we made for Kinetix. Finally, see our [docs](./docs/README.md) for more information and more in-depth examples.
26
+
27
+ <p align="middle">
28
+ <img src="images/bb.gif" width="200" />
29
+ <img src="images/cartpole.gif" width="200" />
30
+ <img src="images/grasper.gif" width="200" />
31
+ </p>
32
+ <p align="middle">
33
+ <img src="images/hc.gif" width="200" />
34
+ <img src="images/hopper.gif" width="200" />
35
+ <img src="images/ll.gif" width="200" />
36
+ </p>
37
+
38
+ <p align="middle">
39
+ <b>The above shows specialist agents trained on their respective levels.</b>
40
+ </p>
41
+
42
+ # 📊 Paper TL;DR
43
+
44
+
45
+
46
+ We train a general agent on millions of procedurally generated physics tasks.
47
+ Every task has the same goal: make the <span style="color:green">green</span> and <span style="color:blue">blue</span> touch, without <span style="color:green">green</span> touching <span style="color:red">red</span>.
48
+ The agent acts by applying torque via motors and force via thrusters.
49
+
50
+ <p align="middle">
51
+ <img src="images/random_1.gif" width="200" />
52
+ <img src="images/random_5.gif" width="200" />
53
+ <img src="images/random_3.gif" width="200" />
54
+ </p>
55
+ <p align="middle">
56
+ <img src="images/random_4.gif" width="200" />
57
+ <img src="images/random_6.gif" width="200" />
58
+ <img src="images/random_7.gif" width="200" />
59
+ </p>
60
+
61
+ <p align="middle">
62
+ <b>The above shows a general agent zero-shotting unseen randomly generated levels.</b>
63
+ </p>
64
+
65
+ We then investigate the transfer capabilities of this agent to unseen handmade levels.
66
+ We find that the agent can zero-shot simple physics problems, but still struggles with harder tasks.
67
+
68
+ <p align="middle">
69
+ <img src="images/general_1.gif" width="200" />
70
+ <img src="images/general_2.gif" width="200" />
71
+ <img src="images/general_3.gif" width="200" />
72
+ </p>
73
+ <p align="middle">
74
+ <img src="images/general_4.gif" width="200" />
75
+ <img src="images/general_5.gif" width="200" />
76
+ <img src="images/general_6.gif" width="200" />
77
+ </p>
78
+
79
+ <p align="middle">
80
+ <b>The above shows a general agent zero-shotting unseen handmade levels.</b>
81
+ </p>
82
+
83
+
84
+ # 📜 Basic Usage
85
+
86
+ Kinetix follows the interfaces established in [gymnax](https://github.com/RobertTLange/gymnax) and [jaxued](https://github.com/DramaCow/jaxued):
87
+
88
+ ```python
89
+ # Use default parameters
90
+ env_params = EnvParams()
91
+ static_env_params = StaticEnvParams()
92
+ ued_params = UEDParams()
93
+
94
+ # Create the environment
95
+ env = make_kinetix_env_from_args(
96
+ obs_type="pixels",
97
+ action_type="multidiscrete",
98
+ reset_type="replay",
99
+ static_env_params=static_env_params,
100
+ )
101
+
102
+ # Sample a random level
103
+ rng = jax.random.PRNGKey(0)
104
+ rng, _rng = jax.random.split(rng)
105
+ level = sample_kinetix_level(_rng, env.physics_engine, env_params, static_env_params, ued_params)
106
+
107
+ # Reset the environment state to this level
108
+ rng, _rng = jax.random.split(rng)
109
+ obs, env_state = env.reset_to_level(_rng, level, env_params)
110
+
111
+ # Take a step in the environment
112
+ rng, _rng = jax.random.split(rng)
113
+ action = env.action_space(env_params).sample(_rng)
114
+ rng, _rng = jax.random.split(rng)
115
+ obs, env_state, reward, done, info = env.step(_rng, env_state, action, env_params)
116
+ ```
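+ The snippet above assumes the relevant imports are in scope; a minimal set (matching `examples/example_random_level_replay.py`) is:
+
+ ```python
+ import jax
+
+ from kinetix.environment.env import make_kinetix_env_from_args
+ from kinetix.environment.env_state import StaticEnvParams, EnvParams
+ from kinetix.environment.ued.distributions import sample_kinetix_level
+ from kinetix.environment.ued.ued_state import UEDParams
+ ```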
117
+
118
+
119
+ # ⬇️ Installation
120
+ To install Kinetix with a CUDA-enabled JAX backend (tested with python3.10):
121
+ ```commandline
122
+ git clone https://github.com/FlairOx/Kinetix.git
123
+ cd Kinetix
124
+ pip install -e .
125
+ pre-commit install
126
+ ```
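+ If JAX does not pick up your GPU after this, you may need to install a CUDA build of JAX explicitly (shown here for CUDA 12 wheels; adjust to your setup):
+
+ ```commandline
+ pip install -U "jax[cuda12]"
+ ```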
127
+
128
+ # 🎯 Editor
129
+ We recommend using the [KinetixJS editor](https://kinetix-env.github.io/gallery.html?editor=true), but also provide a native (less polished) Kinetix editor.
130
+
131
+ To open this editor, run the following command:
132
+ ```commandline
133
+ python3 kinetix/editor.py
134
+ ```
135
+
136
+ The controls in the editor are:
137
+ - Move between `edit` and `play` modes using `spacebar`
138
+ - In `edit` mode, the type of edit is shown by the icon at the top and is changed by scrolling the mouse wheel. For instance, by navigating to the rectangle editing function you can click to place a rectangle.
139
+ - You can also press the number keys to cycle between modes.
140
+ - To open handmade levels, press `ctrl+O` and navigate to the ones in the `worlds/l` folder.
141
+ - **When playing a level use the arrow keys to control motors and the numeric keys (1, 2) to control thrusters.**
142
+
143
+ # 📈 Experiments
144
+
145
+ We have three primary experiment files:
146
+ 1. [**SFL**](https://github.com/amacrutherford/sampling-for-learnability?tab=readme-ov-file): Training on levels with high learnability; this is how we trained our best general agents.
147
+ 2. **PLR**: PLR/DR/ACCEL in the [JAXUED](https://github.com/DramaCow/jaxued) style.
148
+ 3. **PPO**: Normal PPO in the [PureJaxRL](https://github.com/luchris429/purejaxrl/) style.
149
+
150
+ To run experiments with default parameters run any of the following:
151
+ ```commandline
152
+ python3 experiments/sfl.py
153
+ python3 experiments/plr.py
154
+ python3 experiments/ppo.py
155
+ ```
156
+
157
+ We use [hydra](https://hydra.cc/) for managing our configs. See the `configs/` folder for all the hydra configs that will be used by default.
158
+ If you want to run experiments with different configurations, you can either edit these configs or pass command-line arguments like so:
159
+
160
+ ```commandline
161
+ python3 experiments/sfl.py model.transformer_depth=8
162
+ ```
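+ Group selections and individual key overrides can be combined in one command; for example (all groups and keys as defined in `configs/`):
+
+ ```commandline
+ python3 experiments/ppo.py env=symbolic env_size=m eval=m eval_env_size=m learning.num_train_envs=512 misc.use_wandb=false
+ ```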
163
+
164
+ These experiments use [wandb](https://wandb.ai/home) for logging by default.
165
+
166
+ ## 🏋️ Training RL Agents
167
+ We provide several different ways to train RL agents, with the three most common options being (a) [training an agent on random levels](#training-on-random-levels), (b) [training an agent on a single, hand-designed level](#training-on-a-single-hand-designed-level), or (c) [training an agent on a set of hand-designed levels](#training-on-a-set-of-hand-designed-levels).
168
+
169
+ > [!WARNING]
170
+ > Kinetix has three different environment sizes, `s`, `m` and `l`. When running any of the scripts, you have to set the `env_size` option accordingly, for instance, `python3 experiments/ppo.py train_levels=random env_size=m` would train on random `m` levels.
171
+ > It will give an error if you try to load large levels into a small env size; for instance, `python3 experiments/ppo.py train_levels=m env_size=s` would error.
172
+
173
+ ### Training on random levels
174
+ This is the default option, but we give the explicit command for completeness:
175
+ ```commandline
176
+ python3 experiments/ppo.py train_levels=random
177
+ ```
178
+ ### Training on a single hand-designed level
179
+
180
+ > [!NOTE]
181
+ > Check the `worlds/` folder for handmade levels for each size category. By default, the loading functions require a path relative to the `worlds/` directory.
182
+
183
+ ```commandline
184
+ python3 experiments/ppo.py train_levels=s train_levels.train_levels_list='["s/h4_thrust_aim.json"]'
185
+ ```
186
+ ### Training on a set of hand-designed levels
187
+ ```commandline
188
+ python3 experiments/ppo.py train_levels=s env_size=s eval_env_size=s
189
+ # python3 experiments/ppo.py train_levels=m env_size=m eval_env_size=m
190
+ # python3 experiments/ppo.py train_levels=l env_size=l eval_env_size=l
191
+ ```
192
+
193
+ Or, on a custom set:
194
+ ```commandline
195
+ python3 experiments/ppo.py train_levels=l eval_env_size=l env_size=l train_levels.train_levels_list='["s/h2_one_wheel_car","l/h11_obstacle_avoidance"]'
196
+ ```
197
+
198
+
199
+ # 🔎 See Also
200
+ - 🌐 [Kinetix.js](https://github.com/Michael-Beukman/Kinetix.js) Kinetix reimplemented in Javascript, with a live demo [here](https://kinetix-env.github.io/gallery.html?editor=true).
201
+ - 🍎 [Jax2D](https://github.com/MichaelTMatthews/Jax2D) The physics engine we made for Kinetix.
202
+ - 👨‍💻 [JaxGL](https://github.com/FLAIROx/JaxGL) The graphics library we made for Kinetix.
203
+ - 📋 [Our Paper](https://arxiv.org/abs/2410.23208) for more details and empirical results.
204
+
205
+ # 📚 Citation
206
+ Please cite Kinetix as follows:
207
+ ```
208
+ @article{matthews2024kinetix,
209
+ title={Kinetix: Investigating the Training of General Agents through Open-Ended Physics-Based Control Tasks},
210
+ author={Michael Matthews and Michael Beukman and Chris Lu and Jakob Foerster},
211
+ year={2024},
212
+ eprint={2410.23208},
213
+ archivePrefix={arXiv},
214
+ primaryClass={cs.LG},
215
+ url={https://arxiv.org/abs/2410.23208},
216
+ }
217
+ ```
Kinetix/configs/editor.yaml ADDED
@@ -0,0 +1,22 @@
1
+ defaults:
2
+ - env: entity
3
+ - env_size: l
4
+ - learning:
5
+ - ppo-base
6
+ - ppo-rnn
7
+ - misc: misc
8
+ - model:
9
+ - model-base
10
+ - model-transformer
11
+ - _self_
12
+
13
+ seed: 0
14
+ upscale: 2
15
+ downscale: 1
16
+ fps: 60
17
+ debug: true
18
+
19
+ env:
20
+ frame_skip: 1
21
+
22
+ agent_taking_actions: false
Kinetix/configs/env/entity.yaml ADDED
@@ -0,0 +1,3 @@
1
+ env_name: "Kinetix-Entity-MultiDiscrete-v1"
2
+ dense_reward_scale: 2.0
3
+ frame_skip: 2
Kinetix/configs/env/symbolic.yaml ADDED
@@ -0,0 +1,3 @@
1
+ env_name: "Kinetix-Symbolic-MultiDiscrete-v1"
2
+ dense_reward_scale: 2.0
3
+ frame_skip: 2
Kinetix/configs/env_size/custom.yaml ADDED
@@ -0,0 +1,3 @@
1
+ custom_path: worlds/l/grasp_easy.json
2
+ env_size_type: custom
3
+ env_size_name: custom
Kinetix/configs/env_size/l.yaml ADDED
@@ -0,0 +1,8 @@
1
+ num_polygons: 12
2
+ num_circles: 4
3
+ num_joints: 6
4
+ num_thrusters: 2
5
+ env_size_name: l
6
+ num_motor_bindings: 4
7
+ num_thruster_bindings: 2
8
+ env_size_type: predefined
Kinetix/configs/env_size/m.yaml ADDED
@@ -0,0 +1,8 @@
1
+ num_polygons: 6
2
+ num_circles: 3
3
+ num_joints: 2
4
+ num_thrusters: 2
5
+ env_size_name: m
6
+ num_motor_bindings: 4
7
+ num_thruster_bindings: 2
8
+ env_size_type: predefined
Kinetix/configs/env_size/s.yaml ADDED
@@ -0,0 +1,8 @@
1
+ num_polygons: 5
2
+ num_circles: 2
3
+ num_joints: 1
4
+ num_thrusters: 1
5
+ env_size_name: s
6
+ num_motor_bindings: 4
7
+ num_thruster_bindings: 2
8
+ env_size_type: predefined
Kinetix/configs/eval/eval_all.yaml ADDED
@@ -0,0 +1,82 @@
1
+ eval_levels:
2
+ [
3
+ "s/h0_weak_thrust",
4
+ "s/h7_unicycle_left",
5
+ "s/h3_point_the_thruster",
6
+ "s/h4_thrust_aim",
7
+ "s/h1_thrust_over_ball",
8
+ "s/h5_rotate_fall",
9
+ "s/h9_explode_then_thrust_over",
10
+ "s/h6_unicycle_right",
11
+ "s/h8_unicycle_balance",
12
+ "s/h2_one_wheel_car",
13
+
14
+ "m/h0_unicycle",
15
+ "m/h1_car_left",
16
+ "m/h2_car_right",
17
+ "m/h3_car_thrust",
18
+ "m/h4_thrust_the_needle",
19
+ "m/h5_angry_birds",
20
+ "m/h6_thrust_over",
21
+ "m/h7_car_flip",
22
+ "m/h8_weird_vehicle",
23
+ "m/h9_spin_the_right_way",
24
+ "m/h10_thrust_right_easy",
25
+ "m/h11_thrust_left_easy",
26
+ "m/h12_thrustfall_left",
27
+ "m/h13_thrustfall_right",
28
+ "m/h14_thrustblock",
29
+ "m/h15_thrustshoot",
30
+ "m/h16_thrustcontrol_right",
31
+ "m/h17_thrustcontrol_left",
32
+ "m/h18_thrust_right_very_easy",
33
+ "m/h19_thrust_left_very_easy",
34
+ "m/arm_left",
35
+ "m/arm_right",
36
+ "m/arm_up",
37
+ "m/arm_hard",
38
+
39
+ "l/h0_angrybirds",
40
+ "l/h1_car_left",
41
+ "l/h2_car_ramp",
42
+ "l/h3_car_right",
43
+ "l/h4_cartpole",
44
+ "l/h5_flappy_bird",
45
+ "l/h6_lorry",
46
+ "l/h7_maze_1",
47
+ "l/h8_maze_2",
48
+ "l/h9_morph_direction",
49
+ "l/h10_morph_direction_2",
50
+ "l/h11_obstacle_avoidance",
51
+ "l/h12_platformer_1",
52
+ "l/h13_platformer_2",
53
+ "l/h14_simple_thruster",
54
+ "l/h15_swing_up",
55
+ "l/h16_thruster_goal",
56
+ "l/h17_unicycle",
57
+ "l/hard_beam_balance",
58
+ "l/hard_cartpole_thrust",
59
+ "l/hard_cartpole_wheels",
60
+ "l/hard_lunar_lander",
61
+ "l/hard_pinball",
62
+ "l/grasp_hard",
63
+ "l/grasp_easy",
64
+ "l/mjc_half_cheetah",
65
+ "l/mjc_half_cheetah_easy",
66
+ "l/mjc_hopper",
67
+ "l/mjc_hopper_easy",
68
+ "l/mjc_swimmer",
69
+ "l/mjc_walker",
70
+ "l/mjc_walker_easy",
71
+ "l/car_launch",
72
+ "l/car_swing_around",
73
+ "l/chain_lander",
74
+ "l/chain_thrust",
75
+ "l/gears",
76
+ "l/lever_puzzle",
77
+ "l/pr",
78
+ "l/rail",
79
+ ]
80
+ eval_num_attempts: 10
81
+ eval_freq: 10
82
+ EVAL_ON_SAMPLED: false
Kinetix/configs/eval/eval_auto.yaml ADDED
@@ -0,0 +1,4 @@
1
+ eval_levels: "auto"
2
+ eval_num_attempts: 10
3
+ eval_freq: 10
4
+ EVAL_ON_SAMPLED: false
Kinetix/configs/eval/eval_general.yaml ADDED
@@ -0,0 +1,7 @@
1
+ eval_levels:
2
+ [
3
+ "easy.simple_thruster",
4
+ ]
5
+ eval_num_attempts: 10
6
+ eval_freq: 10
7
+ EVAL_ON_SAMPLED: false
Kinetix/configs/eval/l.yaml ADDED
@@ -0,0 +1,46 @@
1
+ eval_levels:
2
+ [
3
+ "l/h0_angrybirds",
4
+ "l/h1_car_left",
5
+ "l/h2_car_ramp",
6
+ "l/h3_car_right",
7
+ "l/h4_cartpole",
8
+ "l/h5_flappy_bird",
9
+ "l/h6_lorry",
10
+ "l/h7_maze_1",
11
+ "l/h8_maze_2",
12
+ "l/h9_morph_direction",
13
+ "l/h10_morph_direction_2",
14
+ "l/h11_obstacle_avoidance",
15
+ "l/h12_platformer_1",
16
+ "l/h13_platformer_2",
17
+ "l/h14_simple_thruster",
18
+ "l/h15_swing_up",
19
+ "l/h16_thruster_goal",
20
+ "l/h17_unicycle",
21
+ "l/hard_beam_balance",
22
+ "l/hard_cartpole_thrust",
23
+ "l/hard_cartpole_wheels",
24
+ "l/hard_lunar_lander",
25
+ "l/hard_pinball",
26
+ "l/grasp_hard",
27
+ "l/grasp_easy",
28
+ "l/mjc_half_cheetah",
29
+ "l/mjc_half_cheetah_easy",
30
+ "l/mjc_hopper",
31
+ "l/mjc_hopper_easy",
32
+ "l/mjc_swimmer",
33
+ "l/mjc_walker",
34
+ "l/mjc_walker_easy",
35
+ "l/car_launch",
36
+ "l/car_swing_around",
37
+ "l/chain_lander",
38
+ "l/chain_thrust",
39
+ "l/gears",
40
+ "l/lever_puzzle",
41
+ "l/pr",
42
+ "l/rail",
43
+ ]
44
+ eval_num_attempts: 10
45
+ eval_freq: 50
46
+ EVAL_ON_SAMPLED: true
Kinetix/configs/eval/m.yaml ADDED
@@ -0,0 +1,30 @@
1
+ eval_levels:
2
+ [
3
+ "m/h0_unicycle",
4
+ "m/h1_car_left",
5
+ "m/h2_car_right",
6
+ "m/h3_car_thrust",
7
+ "m/h4_thrust_the_needle",
8
+ "m/h5_angry_birds",
9
+ "m/h6_thrust_over",
10
+ "m/h7_car_flip",
11
+ "m/h8_weird_vehicle",
12
+ "m/h9_spin_the_right_way",
13
+ "m/h10_thrust_right_easy",
14
+ "m/h11_thrust_left_easy",
15
+ "m/h12_thrustfall_left",
16
+ "m/h13_thrustfall_right",
17
+ "m/h14_thrustblock",
18
+ "m/h15_thrustshoot",
19
+ "m/h16_thrustcontrol_right",
20
+ "m/h17_thrustcontrol_left",
21
+ "m/h18_thrust_right_very_easy",
22
+ "m/h19_thrust_left_very_easy",
23
+ "m/arm_left",
24
+ "m/arm_right",
25
+ "m/arm_up",
26
+ "m/arm_hard",
27
+ ]
28
+ eval_num_attempts: 10
29
+ eval_freq: 50
30
+ EVAL_ON_SAMPLED: true
Kinetix/configs/eval/mujoco.yaml ADDED
@@ -0,0 +1,13 @@
1
+ eval_levels:
2
+ [
3
+ "l/mjc_half_cheetah",
4
+ "l/mjc_half_cheetah_easy",
5
+ "l/mjc_hopper",
6
+ "l/mjc_hopper_easy",
7
+ "l/mjc_swimmer",
8
+ "l/mjc_walker",
9
+ "l/mjc_walker_easy",
10
+ ]
11
+ eval_num_attempts: 10
12
+ eval_freq: 10
13
+ EVAL_ON_SAMPLED: false
Kinetix/configs/eval/s.yaml ADDED
@@ -0,0 +1,16 @@
1
+ eval_levels:
2
+ [
3
+ "s/h0_weak_thrust",
4
+ "s/h7_unicycle_left",
5
+ "s/h3_point_the_thruster",
6
+ "s/h4_thrust_aim",
7
+ "s/h1_thrust_over_ball",
8
+ "s/h5_rotate_fall",
9
+ "s/h9_explode_then_thrust_over",
10
+ "s/h6_unicycle_right",
11
+ "s/h8_unicycle_balance",
12
+ "s/h2_one_wheel_car",
13
+ ]
14
+ eval_num_attempts: 10
15
+ eval_freq: 50
16
+ EVAL_ON_SAMPLED: true
Kinetix/configs/eval_env_size/l.yaml ADDED
@@ -0,0 +1,7 @@
1
+ num_polygons: 12
2
+ num_circles: 4
3
+ num_joints: 6
4
+ num_thrusters: 2
5
+ env_size_name: l
6
+ num_motor_bindings: 4
7
+ num_thruster_bindings: 2
Kinetix/configs/eval_env_size/m.yaml ADDED
@@ -0,0 +1,7 @@
1
+ num_polygons: 6
2
+ num_circles: 3
3
+ num_joints: 2
4
+ num_thrusters: 2
5
+ env_size_name: m
6
+ num_motor_bindings: 4
7
+ num_thruster_bindings: 2
Kinetix/configs/eval_env_size/s.yaml ADDED
@@ -0,0 +1,7 @@
1
+ num_polygons: 5
2
+ num_circles: 2
3
+ num_joints: 1
4
+ num_thrusters: 1
5
+ env_size_name: s
6
+ num_motor_bindings: 4
7
+ num_thruster_bindings: 2
Kinetix/configs/learning/ppo-base.yaml ADDED
@@ -0,0 +1,20 @@
1
+ lr: 5e-5
2
+ peak_lr: 3e-4
3
+ initial_lr: 1e-5
4
+ warmup_frac: 0.1
5
+ max_grad_norm: 1.0
6
+ total_timesteps: 1073741824
7
+ num_train_envs: 2048
8
+ num_minibatches: 32
9
+ gamma: 0.995
10
+ update_epochs: 8
11
+ clip_eps: 0.2
12
+ gae_lambda: 0.9
13
+ ent_coef: 0.01
14
+ anneal_lr: false
15
+ warmup_lr: false
16
+ vf_coef: 0.5
17
+ permute_state_during_training: false
18
+ filter_levels: true
19
+ level_filter_n_steps: 64
20
+ level_filter_sample_ratio: 2
Kinetix/configs/learning/ppo-rnn.yaml ADDED
@@ -0,0 +1,2 @@
1
+ num_steps: 64
2
+ num_repeats: 1
Kinetix/configs/learning/ppo-sfl.yaml ADDED
@@ -0,0 +1 @@
1
+ num_steps: 512
Kinetix/configs/learning/ppo-ued.yaml ADDED
@@ -0,0 +1,2 @@
1
+ num_steps: 64
2
+ outer_rollout_steps: 4
Kinetix/configs/misc/misc.yaml ADDED
@@ -0,0 +1,16 @@
1
+ group: "auto"
2
+ group_auto_prefix: ""
3
+ save_path: "checkpoints/kinetix"
4
+ use_wandb: true
5
+ save_policy: true
6
+ wandb_project: "kinetix-experiments"
7
+ wandb_entity: null
8
+ wandb_mode : online
9
+ video_frequency: 10
10
+ load_from_checkpoint: null
11
+ load_only_params: true
12
+ checkpoint_save_freq: 512
13
+ checkpoint_human_numbers: false
14
+ load_legacy_checkpoint: false
15
+ load_train_levels_legacy: false
16
+ economical_saving: false
Kinetix/configs/model/model-base.yaml ADDED
@@ -0,0 +1,4 @@
1
+ fc_layer_depth: 5
2
+ fc_layer_width: 128
3
+ activation: "tanh"
4
+ recurrent_model: False
Kinetix/configs/model/model-transformer.yaml ADDED
@@ -0,0 +1,6 @@
1
+ transformer_depth: 2
2
+ transformer_size: 16
3
+ transformer_encoder_size: 128
4
+ num_heads: 8
5
+ full_attention_mask: false
6
+ aggregate_mode: dummy_and_mean
Kinetix/configs/plr.yaml ADDED
@@ -0,0 +1,17 @@
1
+ defaults:
2
+ - env: entity
3
+ - learning:
4
+ - ppo-base
5
+ - ppo-ued
6
+ - misc: misc
7
+ - env_size: s
8
+ - eval: s
9
+ - eval_env_size: s
10
+ - ued: plr
11
+ - train_levels: random
12
+ - model:
13
+ - model-base
14
+ - model-transformer
15
+ - _self_
16
+
17
+ seed: 0
Kinetix/configs/ppo.yaml ADDED
@@ -0,0 +1,20 @@
1
+ defaults:
2
+ - env: entity
3
+ - env_size: s
4
+ - learning:
5
+ - ppo-base
6
+ - ppo-rnn
7
+ - misc: misc
8
+ - eval: s
9
+ - eval_env_size: s
10
+ - train_levels: random
11
+ - model:
12
+ - model-base
13
+ - model-transformer
14
+ - _self_
15
+
16
+
17
+ eval:
18
+ eval_freq: 40
19
+
20
+ seed: 0
Kinetix/configs/sfl.yaml ADDED
@@ -0,0 +1,21 @@
1
+ defaults:
2
+ - env: entity
3
+ - learning:
4
+ - ppo-base
5
+ - ppo-rnn
6
+ - misc: misc
7
+ - ued: sfl
8
+ - env_size: s
9
+ - eval: s
10
+ - eval_env_size: s
11
+ - train_levels: random
12
+ - model:
13
+ - model-base
14
+ - model-transformer
15
+ - _self_
16
+
17
+ eval:
18
+ eval_freq: 128
19
+ learning:
20
+ num_steps: 256
21
+ seed: 0
Kinetix/configs/train_levels/l.yaml ADDED
@@ -0,0 +1,44 @@
1
+ train_level_mode: list
2
+ train_levels_list:
3
+ [
4
+ "l/h0_angrybirds",
5
+ "l/h1_car_left",
6
+ "l/h2_car_ramp",
7
+ "l/h3_car_right",
8
+ "l/h4_cartpole",
9
+ "l/h5_flappy_bird",
10
+ "l/h6_lorry",
11
+ "l/h7_maze_1",
12
+ "l/h8_maze_2",
13
+ "l/h9_morph_direction",
14
+ "l/h10_morph_direction_2",
15
+ "l/h11_obstacle_avoidance",
16
+ "l/h12_platformer_1",
17
+ "l/h13_platformer_2",
18
+ "l/h14_simple_thruster",
19
+ "l/h15_swing_up",
20
+ "l/h16_thruster_goal",
21
+ "l/h17_unicycle",
22
+ "l/hard_beam_balance",
23
+ "l/hard_cartpole_thrust",
24
+ "l/hard_cartpole_wheels",
25
+ "l/hard_lunar_lander",
26
+ "l/hard_pinball",
27
+ "l/mjc_half_cheetah",
28
+ "l/mjc_half_cheetah_easy",
29
+ "l/mjc_hopper",
30
+ "l/mjc_hopper_easy",
31
+ "l/mjc_swimmer",
32
+ "l/mjc_walker",
33
+ "l/mjc_walker_easy",
34
+ "l/grasp_hard",
35
+ "l/grasp_easy",
36
+ "l/car_launch",
37
+ "l/car_swing_around",
38
+ "l/chain_lander",
39
+ "l/chain_thrust",
40
+ "l/gears",
41
+ "l/lever_puzzle",
42
+ "l/pr",
43
+ "l/rail",
44
+ ]
Kinetix/configs/train_levels/m.yaml ADDED
@@ -0,0 +1,28 @@
1
+ train_level_mode: list
2
+ train_levels_list:
3
+ [
4
+ "m/h0_unicycle",
5
+ "m/h1_car_left",
6
+ "m/h2_car_right",
7
+ "m/h3_car_thrust",
8
+ "m/h4_thrust_the_needle",
9
+ "m/h5_angry_birds",
10
+ "m/h6_thrust_over",
11
+ "m/h7_car_flip",
12
+ "m/h8_weird_vehicle",
13
+ "m/h9_spin_the_right_way",
14
+ "m/h10_thrust_right_easy",
15
+ "m/h11_thrust_left_easy",
16
+ "m/h12_thrustfall_left",
17
+ "m/h13_thrustfall_right",
18
+ "m/h14_thrustblock",
19
+ "m/h15_thrustshoot",
20
+ "m/h16_thrustcontrol_right",
21
+ "m/h17_thrustcontrol_left",
22
+ "m/h18_thrust_right_very_easy",
23
+ "m/h19_thrust_left_very_easy",
24
+ "m/arm_left",
25
+ "m/arm_right",
26
+ "m/arm_up",
27
+ "m/arm_hard",
28
+ ]
Kinetix/configs/train_levels/mujoco.yaml ADDED
@@ -0,0 +1,11 @@
1
+ train_level_mode: list
2
+ train_levels_list:
3
+ [
4
+ "l/mjc_half_cheetah",
5
+ "l/mjc_half_cheetah_easy",
6
+ "l/mjc_hopper",
7
+ "l/mjc_hopper_easy",
8
+ "l/mjc_swimmer",
9
+ "l/mjc_walker",
10
+ "l/mjc_walker_easy",
11
+ ]
Kinetix/configs/train_levels/random.yaml ADDED
@@ -0,0 +1,2 @@
1
+ train_level_mode: random
2
+ train_level_distribution: distribution_v3
Kinetix/configs/train_levels/s.yaml ADDED
@@ -0,0 +1,14 @@
1
+ train_level_mode: list
2
+ train_levels_list:
3
+ [
4
+ "s/h0_weak_thrust",
5
+ "s/h7_unicycle_left",
6
+ "s/h3_point_the_thruster",
7
+ "s/h4_thrust_aim",
8
+ "s/h1_thrust_over_ball",
9
+ "s/h5_rotate_fall",
10
+ "s/h9_explode_then_thrust_over",
11
+ "s/h6_unicycle_right",
12
+ "s/h8_unicycle_balance",
13
+ "s/h2_one_wheel_car",
14
+ ]
Kinetix/configs/train_levels/train_all.yaml ADDED
@@ -0,0 +1,80 @@
1
+ train_level_mode: list
2
+ train_levels_list:
3
+ [
4
+ "s/h0_weak_thrust",
5
+ "s/h7_unicycle_left",
6
+ "s/h3_point_the_thruster",
7
+ "s/h4_thrust_aim",
8
+ "s/h1_thrust_over_ball",
9
+ "s/h5_rotate_fall",
10
+ "s/h9_explode_then_thrust_over",
11
+ "s/h6_unicycle_right",
12
+ "s/h8_unicycle_balance",
13
+ "s/h2_one_wheel_car",
14
+
15
+ "m/h0_unicycle",
16
+ "m/h1_car_left",
17
+ "m/h2_car_right",
18
+ "m/h3_car_thrust",
19
+ "m/h4_thrust_the_needle",
20
+ "m/h5_angry_birds",
21
+ "m/h6_thrust_over",
22
+ "m/h7_car_flip",
23
+ "m/h8_weird_vehicle",
24
+ "m/h9_spin_the_right_way",
25
+ "m/h10_thrust_right_easy",
26
+ "m/h11_thrust_left_easy",
27
+ "m/h12_thrustfall_left",
28
+ "m/h13_thrustfall_right",
29
+ "m/h14_thrustblock",
30
+ "m/h15_thrustshoot",
31
+ "m/h16_thrustcontrol_right",
32
+ "m/h17_thrustcontrol_left",
33
+ "m/h18_thrust_right_very_easy",
34
+ "m/h19_thrust_left_very_easy",
35
+ "m/arm_left",
36
+ "m/arm_right",
37
+ "m/arm_up",
38
+ "m/arm_hard",
39
+
40
+ "l/h0_angrybirds",
41
+ "l/h1_car_left",
42
+ "l/h2_car_ramp",
43
+ "l/h3_car_right",
44
+ "l/h4_cartpole",
45
+ "l/h5_flappy_bird",
46
+ "l/h6_lorry",
47
+ "l/h7_maze_1",
48
+ "l/h8_maze_2",
49
+ "l/h9_morph_direction",
50
+ "l/h10_morph_direction_2",
51
+ "l/h11_obstacle_avoidance",
52
+ "l/h12_platformer_1",
53
+ "l/h13_platformer_2",
54
+ "l/h14_simple_thruster",
55
+ "l/h15_swing_up",
56
+ "l/h16_thruster_goal",
57
+ "l/h17_unicycle",
58
+ "l/hard_beam_balance",
59
+ "l/hard_cartpole_thrust",
60
+ "l/hard_cartpole_wheels",
61
+ "l/hard_lunar_lander",
62
+ "l/hard_pinball",
63
+ "l/grasp_hard",
64
+ "l/grasp_easy",
65
+ "l/mjc_half_cheetah",
66
+ "l/mjc_half_cheetah_easy",
67
+ "l/mjc_hopper",
68
+ "l/mjc_hopper_easy",
69
+ "l/mjc_swimmer",
70
+ "l/mjc_walker",
71
+ "l/mjc_walker_easy",
72
+ "l/car_launch",
73
+ "l/car_swing_around",
74
+ "l/chain_lander",
75
+ "l/chain_thrust",
76
+ "l/gears",
77
+ "l/lever_puzzle",
78
+ "l/pr",
79
+ "l/rail",
80
+ ]
Kinetix/configs/ued/accel.yaml ADDED
@@ -0,0 +1,16 @@
1
+ use_accel: true
2
+ exploratory_grad_updates: true
3
+ num_edits: 5
4
+ score_function: MaxMC
5
+ level_buffer_capacity: 4000
6
+ replay_prob: 0.5
7
+ staleness_coeff: 0.3
8
+ temperature: 1.0
9
+ topk_k: 8
10
+ minimum_fill_ratio: 0.5
11
+ prioritization: rank
12
+ buffer_duplicate_check: false
13
+ buffer_train: false
14
+ mode: train
15
+ checkpoint_directory: checkpoints/physicsenv/ued
16
+ max_number_of_checkpoints: 5
Kinetix/configs/ued/plr.yaml ADDED
@@ -0,0 +1,17 @@
1
+ use_accel: false
2
+ exploratory_grad_updates: true
3
+ num_edits: 2
4
+ score_function: MaxMC
5
+ level_buffer_capacity: 4000
6
+ replay_prob: 0.5
7
+ staleness_coeff: 0.3
8
+ temperature: 1.0
9
+ topk_k: 8
10
+ minimum_fill_ratio: 0.5
11
+ prioritization: rank
12
+ buffer_duplicate_check: false
13
+ buffer_train: false
14
+ mode: train
15
+ checkpoint_directory: checkpoints/physicsenv/ued
16
+ max_number_of_checkpoints: 5
17
+ accel_start_from_empty: True
Kinetix/configs/ued/sfl.yaml ADDED
@@ -0,0 +1,9 @@
1
+ "sampled_envs_ratio": 0.5
2
+ "batch_size": 4096
3
+ "num_batches": 3
4
+ "rollout_steps": 512
5
+ "num_to_save": 1024
6
+
7
+ log_learnability_before_after: false
8
+ put_eval_levels_in_buffer: false
9
+ save_learnability_buffer_pickle: false
Kinetix/docs/README.md ADDED
@@ -0,0 +1,83 @@
1
+ # Documentation
2
+ This is intended to provide some more details about how Kinetix works, including more in-depth examples. If you are interested in the configuration options, see [here](./configs.md).
3
+
4
+ - [Documentation](#documentation)
5
+ - [Different Versions of Kinetix Environments](#different-versions-of-kinetix-environments)
6
+ - [Action Spaces](#action-spaces)
7
+ - [Observation Spaces](#observation-spaces)
8
+ - [Resetting Functionality](#resetting-functionality)
9
+ - [Using Kinetix to easily design your own JAX Environments](#using-kinetix-to-easily-design-your-own-jax-environments)
10
+ - [Step 1 - Design an Environment](#step-1---design-an-environment)
11
+ - [Step 2 - Export It](#step-2---export-it)
12
+ - [Step 3 - Import It](#step-3---import-it)
13
+ - [Step 4 - Train](#step-4---train)
14
+
15
+
16
+ ## Different Versions of Kinetix Environments
17
+ We provide several different variations on the standard Kinetix environment, where the primary difference is the action and observation spaces.
18
+
19
+ Each of the environments has a different name, of the following form: `Kinetix-<OBS>-<ACTION>-v1`, and can be made using the `make_kinetix_env_from_name` helper function.
20
+ ### Action Spaces
21
+ For all action spaces, the agent can control joints and thrusters. Joints have a property `motor_binding`, which is a way to tie different joints to the same action. Two joints that have the same binding will always perform the same action, likewise for thrusters.
22
+
23
+ We have three action spaces: discrete, continuous, and multi-discrete (which is the default).
24
+ - **Discrete** has `2 * num_motor_bindings + num_thruster_bindings + 1` options, one of which can be active at any time. There are two options for every joint, i.e., backward and forward at full power. There is one option for each thruster, to activate it at full power. The final option is a no-op, meaning that no torque or force is applied to joints/thrusters.
25
+ - **Continuous** has shape `num_motor_bindings + num_thruster_bindings`, where each motor element can take a value between -1 and 1, and thruster elements can take values between 0 and 1.
26
+ - **Multi-Discrete**: This is a discrete action space, but allows multiple joints and thrusters to be active at any one time. The agent must output a flat vector of size `3 * num_motor_bindings + 2 * num_thruster_bindings`. For joints, each group of three represents a categorical distribution over `[0, -1, +1]`, and for thrusters it represents `[0, +1]` (see the decoding sketch below).
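+
+ As a rough illustration of the multi-discrete layout, the sketch below decodes a flat vector of logits into per-binding motor and thruster actions. It assumes the motor groups come before the thruster groups (as suggested by the size formula above) and is purely illustrative, not the library's internal implementation:
+
+ ```python
+ import numpy as np
+
+ num_motor_bindings, num_thruster_bindings = 4, 2
+ # Flat multi-discrete output: 3 logits per motor binding, 2 per thruster binding.
+ logits = np.random.randn(3 * num_motor_bindings + 2 * num_thruster_bindings)
+
+ motor_logits = logits[: 3 * num_motor_bindings].reshape(num_motor_bindings, 3)
+ thruster_logits = logits[3 * num_motor_bindings :].reshape(num_thruster_bindings, 2)
+
+ # Each group of three is a categorical over [0, -1, +1]; each group of two over [0, +1].
+ motor_action = np.array([0.0, -1.0, +1.0])[motor_logits.argmax(axis=-1)]
+ thruster_action = np.array([0.0, +1.0])[thruster_logits.argmax(axis=-1)]
+ print(motor_action, thruster_action)
+ ```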
27
+
28
+ ### Observation Spaces
29
+ We provide three primary observation spaces: Symbolic-Flat (called simply symbolic), Symbolic-Entity (called entity, which is also the default), and Pixels.
30
+ - **Symbolic-Flat** returns a large vector, which is the flattened representation of all shapes and their properties.
31
+ - **Symbolic-Entity** also returns a vector representation of all entities, but does not flatten it, instead returning it in a form that can be used with permutation-invariant network architectures, such as transformers.
32
+ - **Pixels** returns an image representation of the scene. This is partially observable, as features such as the restitution and density of shapes are not shown.
33
+
34
+
35
+ Each observation space has its own pros and cons. **Symbolic-Flat** is the fastest by far, but has two clear downsides. First, it is restricted to a single environment size, e.g. a model trained on `small` cannot be run on `medium` levels. Second, due to the large number of symmetries (e.g. any permutation of the same shapes would represent the same scene but would look very different in this observation space), this generalises worse than *entity*.
36
+
37
+ **Symbolic-Entity** is faster than pixels, but slower than Symbolic-Flat. However, it can be applied to any number of shapes, and is natively permutation invariant. For these reasons we chose it as the default option.
38
+
39
+ Finally, **Pixels** runs the slowest, and also requires more memory, which means that we cannot run as many parallel environments. However, pixels is potentially the most general format, and could theoretically allow transfer to other domains and simulators.
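+
+ As a sketch, the observation space is chosen when constructing the environment. The `"pixels"` value below matches the main README; the `"symbolic"` and `"entity"` strings are assumed to mirror the config names and should be checked against `make_kinetix_env_from_args`:
+
+ ```python
+ from kinetix.environment.env import make_kinetix_env_from_args
+ from kinetix.environment.env_state import StaticEnvParams
+
+ static_env_params = StaticEnvParams()
+
+ # One environment per observation space (obs_type strings other than "pixels" are assumptions).
+ envs = {
+     obs: make_kinetix_env_from_args(
+         obs_type=obs,
+         action_type="multidiscrete",
+         reset_type="replay",
+         static_env_params=static_env_params,
+     )
+     for obs in ("symbolic", "entity", "pixels")
+ }
+ ```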
40
+
41
+
42
+ ## Resetting Functionality
43
+ We have two primary resetting functions that control the environment's behaviour when an episode ends. The first is to train on a known, predefined set of levels, where each reset samples a new level from this set; in the extreme case, this also allows training on only a single level in the standard RL manner. The other main way of resetting is to sample a *random* level from some distribution, meaning that it is exceedingly unlikely to sample the same level twice.
44
+
45
+ ## Using Kinetix to easily design your own JAX Environments
46
+ Since Kinetix has a general physics engine, you can design your own environments and train RL agents on them very fast! This section in the docs describes this pipeline.
47
+ ### Step 1 - Design an Environment
48
+ You can go to our [online editor](https://kinetix-env.github.io/gallery.html?editor=true). You can also have a look at the [gallery](https://kinetix-env.github.io/gallery.html) if you need some inspiration.
49
+
50
+ The following two images show the main editor page, and then the level I designed, where you have to spin the ball the right way. While designing the level, you can play it to test it out, seeing if it is possible and of the appropriate difficulty.
51
+
52
+
53
+ <p align="middle">
54
+ <img src="../images/docs/edit-1.png" width="49%" />
55
+ <img src="../images/docs/edit-2.png" width="49%" />
56
+ </p>
57
+
58
+ ### Step 2 - Export It
59
+ Once you are satisfied with your level, you can download it as a json file by using the button on the bottom left. Once this is downloaded, move it to `$KINETIX_ROOT/worlds/custom/my_custom_level.json`, where `$KINETIX_ROOT` is the root of the Kinetix repo.
60
+
61
+
62
+ ### Step 3 - Import It
63
+ In Python, you can import the level as follows (see `examples/example_premade_level_replay.py` for a full example):
64
+ ```python
65
+ from kinetix.util.saving import load_from_json_file
66
+ level, static_env_params, env_params = load_from_json_file("worlds/custom/my_custom_level.json")
67
+ ```
68
+
69
+ ### Step 4 - Train
70
+ You can use the above if you want to import the level and play around with it. If you want to train an RL agent on this level, you can do the following (see [this section](https://github.com/FLAIROx/Kinetix?tab=readme-ov-file#training-on-a-single-hand-designed-level) of the main README).
71
+
72
+ ```commandline
73
+ python3 experiments/ppo.py env_size=custom \
74
+ env_size.custom_path=custom/my_custom_level.json \
75
+ train_levels=s \
76
+ train_levels.train_levels_list='["custom/my_custom_level.json"]' \
77
+ eval=eval_auto
78
+ ```
79
+
80
+ The agent will then start training, with videos of its progress logged to [wandb](https://wandb.ai).
81
+ <p align="middle">
82
+ <img src="../images/docs/wandb.gif" width="49%" />
83
+ </p>
Kinetix/docs/configs.md ADDED
@@ -0,0 +1,179 @@
1
+ # Configuration
2
+
3
+ - [Configuration](#configuration)
4
+ - [Configuration Headings](#configuration-headings)
5
+ - [Env](#env)
6
+ - [Env Size](#env-size)
7
+ - [Learning](#learning)
8
+ - [Misc](#misc)
9
+ - [Eval](#eval)
10
+ - [Eval Env Size](#eval-env-size)
11
+ - [Train Levels](#train-levels)
12
+ - [Model](#model)
13
+ - [UED](#ued)
14
+
15
+
16
+ We use [hydra](https://hydra.cc) for all of our configurations, and we use [hierarchical configuration](https://hydra.cc/docs/tutorials/structured_config/schema/) to keep everything organised.
17
+
18
+ In particular, we have the following configuration headings, with the base `ppo` config looking like:
19
+ ```yaml
20
+ defaults:
21
+ - env: entity
22
+ - env_size: s
23
+ - learning:
24
+ - ppo-base
25
+ - ppo-rnn
26
+ - misc: misc
27
+ - eval: s
28
+ - eval_env_size: s
29
+ - train_levels: random
30
+ - model:
31
+ - model-base
32
+ - model-transformer
33
+ - _self_
34
+ seed: 0
35
+ ```
36
+
37
+ ## Configuration Headings
38
+ ### Env
39
+ This controls the environment to be used.
40
+ #### Preset Options
41
+ We provide two options in `configs/env`, namely `entity` and `symbolic`; each of these can be used by running `python3 experiments/ppo.py env=symbolic` or `python3 experiments/ppo.py env=entity`. If you wish to customise the options further, you can add any of the following subkeys (e.g. by running `python3 experiments/ppo.py env=symbolic env.dense_reward_scale=0.0`):
42
+ #### Individual Subkeys
43
+ - `env.env_name`: The name of the environment, which controls the observation and action space.
44
+ - `env.dense_reward_scale`: How large the dense reward scale is, set this to zero to disable dense rewards.
45
+ - `env.frame_skip`: The number of frames to skip, setting this to 2 (the default) seems to perform better.
46
+ ### Env Size
47
+ This controls the maximum number of shapes present in the simulation. This has two important tradeoffs, namely speed and representational power: small environments run much faster, but some complex environments require a large number of shapes. See `configs/env_size`.
48
+ #### Preset Options
49
+ - `s`: The `small` preset
50
+ - `m`: `Medium` preset
51
+ - `l`: `Large` preset
52
+ - `custom`: Allows the use of a custom environment size loaded from a json file (see [here](#train-levels) for more).
53
+ #### Individual Subkeys
54
+ - `num_polygons`: How many polygons
55
+ - `num_circles`: How many circles
56
+ - `num_joints`: How many joints
57
+ - `num_thrusters`: How many thrusters
58
+ - `env_size_name`: "s", "m" or "l"
59
+ - `num_motor_bindings`: How many different joint bindings there are, meaning how many different actions are associated with joints. All joints with the same binding will have the same action applied to them.
60
+ - `num_thruster_bindings`: How many different thruster bindings are there
61
+ - `env_size_type`: "predefined" or "custom"
62
+ - `custom_path`: **Only for env_size_type=custom**, controls the json file to load the custom environment size from.
63
+ ### Learning
64
+ This controls the agent's learning, see `configs/learning`
65
+ #### Preset Options
66
+ - `ppo-base`: This has all of the base PPO parameters, and is used by all methods
67
+ - `ppo-rnn`: This has the PureJaxRL settings for some of PPO's hyperparameters (mainly `num_steps` is different)
68
+ - `ppo-sfl`: This has the SFL-specific value of `num_steps`
69
+ - `ppo-ued`: This has the JAXUED-specific `num_steps` and `outer_rollout_steps`
70
+ #### Individual Subkeys
71
+ - `lr`: Learning Rate
72
+ - `anneal_lr`: Whether to anneal LR
73
+ - `warmup_lr`: Whether to warmup LR
74
+ - `peak_lr`: If warming up, the peak
75
+ - `initial_lr`: If warming up, the initial LR
76
+ - `warmup_frac`: If warming up, the warmup fraction of training time
77
+ - `max_grad_norm`: Maximum grad norm
78
+ - `total_timesteps`: How many total environment interactions must be run
79
+ - `num_train_envs`: Number of parallel environments to run simultaneously
80
+ - `num_minibatches`: Minibatches for PPO learning
81
+ - `gamma`: Discount factor
82
+ - `update_epochs`: PPO update epochs
83
+ - `clip_eps`: PPO clipping epsilon
84
+ - `gae_lambda`: PPO Lambda for GAE
85
+ - `ent_coef`: Entropy loss coefficient
86
+ - `vf_coef`: Value function loss coefficient
87
+ - `permute_state_during_training`: If true, the state is permuted on every reset.
88
+ - `filter_levels`: If true, and we are training on random levels, this filters out levels that can be solved by a no-op
89
+ - `level_filter_n_steps`: How many steps to allocate to the no-op policy for filtering
90
+ - `level_filter_sample_ratio`: How many more levels to sample than required (ideally `level_filter_sample_ratio` is more than the fraction that will be filtered out).
91
+ - `num_steps`: PPO rollout length
92
+ - `outer_rollout_steps`: How many learning steps to do for e.g. PLR for each rollout (see the [Craftax paper](https://arxiv.org/abs/2402.16801) for a more in-depth explanation).
93
+ ### Misc
94
+ There are a plethora of miscellaneous options that are grouped under the `misc` category. There is only one preset option, `configs/misc/misc.yaml`.
95
+ #### Individual Subkeys
96
+ - `group`: Wandb group ("auto" usually works well)
97
+ - `group_auto_prefix`: If using group=auto, this is a user-defined prefix
98
+ - `save_path`: Where to save checkpoints to
99
+ - `use_wandb`: Should wandb be logged to
100
+ - `save_policy`: Should we save the policy
101
+ - `wandb_project`: Wandb project
102
+ - `wandb_entity`: Wandb entity, leave as `null` to use your default one
103
+ - `wandb_mode` : Wandb mode
104
+ - `video_frequency`: How often to log videos (they are quite large)
105
+ - `load_from_checkpoint`: Wandb artifact path to load from
106
+ - `load_only_params`: Whether to load just the network parameters or entire train state.
107
+ - `checkpoint_save_freq`: How often to save checkpoints
108
+ - `checkpoint_human_numbers`: Should the checkpoints have human-readable timestep numbers
109
+ - `load_legacy_checkpoint`: Do not use
110
+ - `load_train_levels_legacy`: Do not use
111
+ - `economical_saving`: If true, only saves a few important checkpoints for space conservation purposes.
112
+ ### Eval
113
+ This option (see `configs/eval`) controls how evaluation works, and what levels are used.
114
+ #### Preset Options
115
+ - `s`: Eval on the `s` hand-designed levels located in `worlds/s`
116
+ - `m`: Eval on the `m` hand-designed levels located in `worlds/m`
117
+ - `l`: Eval on the `l` hand-designed levels located in `worlds/l`
118
+ - `eval_all`: Eval on all of the hand-designed eval levels
119
+ - `eval_auto`: If `train_levels` is not random, evaluate on the training levels.
120
+ - `mujoco`: Eval on the recreations of the mujoco tasks.
121
+ - `eval_general`: General option if you are planning on overwriting most options.
122
+ #### Individual Subkeys
123
+ - `eval_levels`: List of eval levels or the string "auto"
124
+ - `eval_num_attempts`: How many times to eval on the same level
125
+ - `eval_freq`: How often to evaluate
126
+ - `EVAL_ON_SAMPLED`: If true, in `plr.py` and `sfl.py`, evaluates on a fixed set of randomly-generated levels
127
+
128
+ ### Eval Env Size
129
+ This controls the size of the evaluation environment. This is crucial to match up with the size of the evaluation levels.
130
+ #### Preset Options
131
+ - `s`: Same as the `env_size` option.
132
+ - `m`: Same as the `env_size` option.
133
+ - `l`: Same as the `env_size` option.
134
+ ### Train Levels
135
+ Which levels to train on.
136
+ #### Preset Options
137
+ - `s`: All of the `s` holdout levels
138
+ - `m`: All of the `m` holdout levels
139
+ - `l`: All of the `l` holdout levels
140
+ - `train_all`: All of the levels from all 3 holdout sets
141
+ - `mujoco`: All of the mujoco recreation levels.
142
+ - `random`: Train on random levels
143
+ #### Individual Subkeys
144
+ - `train_level_mode`: "random" or "list"
145
+ - `train_level_distribution`: if train_level_mode=random, this controls which distribution to use. By default `distribution_v3`
146
+ - `train_levels_list`: The list of levels to train on; see the example below.
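+
+ For example, a custom list config in the style of `configs/train_levels/s.yaml` could look like:
+
+ ```yaml
+ train_level_mode: list
+ train_levels_list:
+   [
+     "s/h2_one_wheel_car",
+     "l/h11_obstacle_avoidance",
+   ]
+ ```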
147
+ ### Model
148
+ This controls the model architecture and options associated with that.
149
+ #### Preset Options
150
+ We use both of the following:
151
+ - `model-base`
152
+ - `model-transformer`
153
+ #### Individual Subkeys
154
+ - `fc_layer_depth`: How many layers in the FC model
155
+ - `fc_layer_width`: How wide each FC layer is
156
+ - `activation`: NN activation
157
+ - `recurrent_model`: Whether or not to use recurrence
158
+ The following are only relevant when using `env=entity`:
159
+ - `transformer_depth`: How many transformer layers to use
160
+ - `transformer_size`: How large the KQV vectors are
161
+ - `transformer_encoder_size`: How large the initial embeddings are
162
+ - `num_heads`: How many attention heads; must be a multiple of 4 and divide `transformer_size` evenly.
163
+ - `full_attention_mask`: If true, all heads use the full attention mask
164
+ - `aggregate_mode`: `dummy_and_mean` works well.
165
+ ### UED
166
+ Options pertaining to UED (i.e., when using the scripts `plr.py` or `sfl.py`)
167
+ #### Preset Options
168
+ - `sfl`
169
+ - `plr`
170
+ - `accel`
171
+ #### Individual Subkeys
172
+ See the individual files for the configuration options used.
173
+ For SFL, we have the following (an example override command is given after the list):
174
+
175
+ - `sampled_envs_ratio`: How many environments are from the SFL buffer and how many are randomly generated
176
+ - `batch_size`: How many levels to evaluate learnability on per batch
177
+ - `num_batches`: How many batches to run when choosing the most learnable levels
178
+ - `rollout_steps`: How many steps to rollout for when doing the learnability calculation.
179
+ - `num_to_save`: How many levels to save in the learnability buffer
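+
+ An illustrative SFL run overriding two of these keys (key names from `configs/ued/sfl.yaml`; the values are arbitrary):
+
+ ```commandline
+ python3 experiments/sfl.py ued.sampled_envs_ratio=0.75 ued.num_to_save=512
+ ```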
Kinetix/examples/example_premade_level_replay.py ADDED
@@ -0,0 +1,46 @@
1
+ import jax
2
+ import jax.numpy as jnp
3
+ import jax.random
4
+ from jax2d.engine import PhysicsEngine
5
+ from matplotlib import pyplot as plt
6
+
7
+ from kinetix.environment.env import make_kinetix_env_from_args
8
+ from kinetix.environment.env_state import StaticEnvParams, EnvParams
9
+ from kinetix.environment.ued.distributions import sample_kinetix_level
10
+ from kinetix.environment.ued.ued_state import UEDParams
11
+ from kinetix.render.renderer_pixels import make_render_pixels
12
+ from kinetix.util.saving import load_from_json_file
13
+
14
+
15
+ def main():
16
+ # Load a premade level
17
+ level, static_env_params, env_params = load_from_json_file("worlds/l/grasp_easy.json")
18
+
19
+ # Create the environment
20
+ env = make_kinetix_env_from_args(
21
+ obs_type="pixels", action_type="continuous", reset_type="replay", static_env_params=static_env_params
22
+ )
23
+
24
+ # Reset the environment state to this level
25
+ rng = jax.random.PRNGKey(0)
26
+ rng, _rng = jax.random.split(rng)
27
+ obs, env_state = env.reset_to_level(_rng, level, env_params)
28
+
29
+ # Take a step in the environment
30
+ rng, _rng = jax.random.split(rng)
31
+ action = env.action_space(env_params).sample(_rng)
32
+ rng, _rng = jax.random.split(rng)
33
+ obs, env_state, reward, done, info = env.step(_rng, env_state, action, env_params)
34
+
35
+ # Render environment
36
+ renderer = make_render_pixels(env_params, static_env_params)
37
+
38
+ # There are a lot of wrappers
39
+ pixels = renderer(env_state.env_state.env_state.env_state)
40
+
41
+ plt.imshow(pixels.astype(jnp.uint8).transpose(1, 0, 2)[::-1])
42
+ plt.show()
43
+
44
+
45
+ if __name__ == "__main__":
46
+ main()
Kinetix/examples/example_random_level_replay.py ADDED
@@ -0,0 +1,51 @@
1
+ import jax
2
+ import jax.numpy as jnp
3
+ import jax.random
4
+ from jax2d.engine import PhysicsEngine
5
+ from matplotlib import pyplot as plt
6
+
7
+ from kinetix.environment.env import make_kinetix_env_from_args
8
+ from kinetix.environment.env_state import StaticEnvParams, EnvParams
9
+ from kinetix.environment.ued.distributions import sample_kinetix_level
10
+ from kinetix.environment.ued.ued_state import UEDParams
11
+ from kinetix.render.renderer_pixels import make_render_pixels
12
+
13
+
14
+ def main():
15
+ # Use default parameters
16
+ env_params = EnvParams()
17
+ static_env_params = StaticEnvParams()
18
+ ued_params = UEDParams()
19
+
20
+ # Create the environment
21
+ env = make_kinetix_env_from_args(
22
+ obs_type="pixels", action_type="continuous", reset_type="replay", static_env_params=static_env_params
23
+ )
24
+
25
+ # Sample a random level
26
+ rng = jax.random.PRNGKey(0)
27
+ rng, _rng = jax.random.split(rng)
28
+ level = sample_kinetix_level(_rng, env.physics_engine, env_params, static_env_params, ued_params)
29
+
30
+ # Reset the environment state to this level
31
+ rng, _rng = jax.random.split(rng)
32
+ obs, env_state = env.reset_to_level(_rng, level, env_params)
33
+
34
+ # Take a step in the environment
35
+ rng, _rng = jax.random.split(rng)
36
+ action = env.action_space(env_params).sample(_rng)
37
+ rng, _rng = jax.random.split(rng)
38
+ obs, env_state, reward, done, info = env.step(_rng, env_state, action, env_params)
39
+
40
+ # Render environment
41
+ renderer = make_render_pixels(env_params, static_env_params)
42
+
43
+ # There are a lot of wrappers
44
+ pixels = renderer(env_state.env_state.env_state.env_state)
45
+
46
+ plt.imshow(pixels.astype(jnp.uint8).transpose(1, 0, 2)[::-1])
47
+ plt.show()
48
+
49
+
50
+ if __name__ == "__main__":
51
+ main()
Kinetix/experiments/plr.py ADDED
@@ -0,0 +1,1143 @@
1
+ from functools import partial
2
+ import time
3
+ from enum import IntEnum
4
+ from typing import Tuple
5
+
6
+ import chex
7
+ import hydra
8
+ import jax
9
+ import jax.numpy as jnp
10
+ import numpy as np
11
+ from omegaconf import OmegaConf
12
+ import optax
13
+ from flax import core, struct
14
+ from flax.training.train_state import TrainState as BaseTrainState
15
+
16
+ import wandb
17
+ from kinetix.environment.ued.distributions import (
18
+ create_random_starting_distribution,
19
+ )
20
+ from kinetix.environment.ued.ued import (
+ make_mutate_env,
+ make_reset_train_function_with_list_of_levels,
+ make_reset_train_function_with_mutations,
+ make_vmapped_filtered_level_sampler,
+ )
30
+ from kinetix.util.config import (
31
+ generate_ued_params_from_config,
32
+ get_video_frequency,
33
+ init_wandb,
34
+ normalise_config,
35
+ save_data_to_local_file,
36
+ generate_params_from_config,
37
+ get_eval_level_groups,
38
+ )
39
+ from jaxued.environments.underspecified_env import EnvState
40
+ from jaxued.level_sampler import LevelSampler
41
+ from jaxued.utils import compute_max_returns, max_mc, positive_value_loss
42
+ from flax.serialization import to_state_dict
43
+
44
+ import sys
45
+
46
+ sys.path.append("experiments")
47
+ from kinetix.environment.env import make_kinetix_env_from_name
48
+ from kinetix.environment.env_state import StaticEnvParams
49
+ from kinetix.environment.wrappers import (
50
+ UnderspecifiedToGymnaxWrapper,
51
+ LogWrapper,
52
+ DenseRewardWrapper,
53
+ AutoReplayWrapper,
54
+ )
55
+ from kinetix.models import make_network_from_config
56
+ from kinetix.render.renderer_pixels import make_render_pixels
57
+ from kinetix.models.actor_critic import ScannedRNN
58
+ from kinetix.util.learning import (
59
+ general_eval,
60
+ get_eval_levels,
61
+ no_op_and_random_rollout,
62
+ sample_trajectories_and_learn,
63
+ )
64
+ from kinetix.util.saving import (
65
+ load_train_state_from_wandb_artifact_path,
66
+ save_model_to_wandb,
67
+ )
68
+
69
+
70
+ class UpdateState(IntEnum):
71
+ DR = 0
72
+ REPLAY = 1
73
+ MUTATE = 2
74
+
75
+
76
+ def get_level_complexity_metrics(all_levels: EnvState, static_env_params: StaticEnvParams):
77
+ def get_for_single_level(level):
78
+ return {
79
+ "complexity/num_shapes": level.polygon.active[static_env_params.num_static_fixated_polys :].sum()
80
+ + level.circle.active.sum(),
81
+ "complexity/num_joints": level.joint.active.sum(),
82
+ "complexity/num_thrusters": level.thruster.active.sum(),
83
+ "complexity/num_rjoints": (level.joint.active * jnp.logical_not(level.joint.is_fixed_joint)).sum(),
84
+ "complexity/num_fjoints": (level.joint.active * (level.joint.is_fixed_joint)).sum(),
85
+ "complexity/has_ball": ((level.polygon_shape_roles == 1) * level.polygon.active).sum()
86
+ + ((level.circle_shape_roles == 1) * level.circle.active).sum(),
87
+ "complexity/has_goal": ((level.polygon_shape_roles == 2) * level.polygon.active).sum()
88
+ + ((level.circle_shape_roles == 2) * level.circle.active).sum(),
89
+ }
90
+
91
+ return jax.tree.map(lambda x: x.mean(), jax.vmap(get_for_single_level)(all_levels))
92
+
93
+
94
+ def get_ued_score_metrics(all_ued_scores):
95
+ (mc, pvl, learn) = all_ued_scores
96
+ scores = {}
97
+ for score, name in zip([mc, pvl, learn], ["MaxMC", "PVL", "Learnability"]):
98
+ scores[f"ued_scores/{name}/Mean"] = score.mean()
99
+ scores[f"ued_scores_additional/{name}/Max"] = score.max()
100
+ scores[f"ued_scores_additional/{name}/Min"] = score.min()
101
+
102
+ return scores
103
+
104
+
105
+ class TrainState(BaseTrainState):
106
+ sampler: core.FrozenDict[str, chex.ArrayTree] = struct.field(pytree_node=True)
107
+ update_state: UpdateState = struct.field(pytree_node=True)
108
+ # === Below is used for logging ===
109
+ num_dr_updates: int
110
+ num_replay_updates: int
111
+ num_mutation_updates: int
112
+
113
+ dr_last_level_batch_scores: chex.ArrayTree = struct.field(pytree_node=True)
114
+ replay_last_level_batch_scores: chex.ArrayTree = struct.field(pytree_node=True)
115
+ mutation_last_level_batch_scores: chex.ArrayTree = struct.field(pytree_node=True)
116
+
117
+ dr_last_level_batch: chex.ArrayTree = struct.field(pytree_node=True)
118
+ replay_last_level_batch: chex.ArrayTree = struct.field(pytree_node=True)
119
+ mutation_last_level_batch: chex.ArrayTree = struct.field(pytree_node=True)
120
+
121
+ dr_last_rollout_batch: chex.ArrayTree = struct.field(pytree_node=True)
122
+ replay_last_rollout_batch: chex.ArrayTree = struct.field(pytree_node=True)
123
+ mutation_last_rollout_batch: chex.ArrayTree = struct.field(pytree_node=True)
124
+
125
+
126
+ # region PPO helper functions
127
+
128
+ # endregion
129
+
130
+
131
+ def train_state_to_log_dict(train_state: TrainState, level_sampler: LevelSampler) -> dict:
132
+ """To prevent the entire (large) train_state to be copied to the CPU when doing logging, this function returns all of the important information in a dictionary format.
133
+
134
+ Anything in the `log` key will be logged to wandb.
135
+
136
+ Args:
137
+ train_state (TrainState):
138
+ level_sampler (LevelSampler):
139
+
140
+ Returns:
141
+ dict:
142
+ """
143
+ sampler = train_state.sampler
144
+ idx = jnp.arange(level_sampler.capacity) < sampler["size"]
145
+ s = jnp.maximum(idx.sum(), 1)
146
+ return {
147
+ "log": {
148
+ "level_sampler/size": sampler["size"],
149
+ "level_sampler/episode_count": sampler["episode_count"],
150
+ "level_sampler/max_score": sampler["scores"].max(),
151
+ "level_sampler/weighted_score": (sampler["scores"] * level_sampler.level_weights(sampler)).sum(),
152
+ "level_sampler/mean_score": (sampler["scores"] * idx).sum() / s,
153
+ },
154
+ "info": {
155
+ "num_dr_updates": train_state.num_dr_updates,
156
+ "num_replay_updates": train_state.num_replay_updates,
157
+ "num_mutation_updates": train_state.num_mutation_updates,
158
+ },
159
+ }
160
+
161
+
162
+ def compute_learnability(config, done, reward, info, num_envs):
163
+ num_agents = 1
164
+ BATCH_ACTORS = num_envs * num_agents
165
+
166
+ rollout_length = config["num_steps"] * config["outer_rollout_steps"]
167
+
168
+ @partial(jax.vmap, in_axes=(None, 1, 1, 1))
169
+ @partial(jax.jit, static_argnums=(0,))
170
+ def _calc_outcomes_by_agent(max_steps: int, dones, returns, info):
171
+ idxs = jnp.arange(max_steps)
172
+
173
+ @partial(jax.vmap, in_axes=(0, 0))
174
+ def __ep_outcomes(start_idx, end_idx):
175
+ mask = (idxs > start_idx) & (idxs <= end_idx) & (end_idx != max_steps)
176
+ r = jnp.sum(returns * mask)
177
+ goal_r = info["GoalR"]
178
+ success = jnp.sum(goal_r * mask)
179
+ collision = 0
180
+ timeo = 0
181
+ l = end_idx - start_idx
182
+ return r, success, collision, timeo, l
183
+
184
+ done_idxs = jnp.argwhere(dones, size=50, fill_value=max_steps).squeeze()
185
+ mask_done = jnp.where(done_idxs == max_steps, 0, 1)
186
+ ep_return, success, collision, timeo, length = __ep_outcomes(
187
+ jnp.concatenate([jnp.array([-1]), done_idxs[:-1]]), done_idxs
188
+ )
189
+
190
+ return {
191
+ "ep_return": ep_return.mean(where=mask_done),
192
+ "num_episodes": mask_done.sum(),
193
+ "num_success": success.sum(where=mask_done),
194
+ "success_rate": success.mean(where=mask_done),
195
+ "collision_rate": collision.mean(where=mask_done),
196
+ "timeout_rate": timeo.mean(where=mask_done),
197
+ "ep_len": length.mean(where=mask_done),
198
+ }
199
+
200
+ done_by_env = done.reshape((-1, num_agents, num_envs))
201
+ reward_by_env = reward.reshape((-1, num_agents, num_envs))
202
+ o = _calc_outcomes_by_agent(rollout_length, done, reward, info)
203
+ success_by_env = o["success_rate"].reshape((num_agents, num_envs))
204
+ learnability_by_env = (success_by_env * (1 - success_by_env)).sum(axis=0)
205
+
206
+ return (
207
+ learnability_by_env,
208
+ o["num_episodes"].reshape(num_agents, num_envs).sum(axis=0),
209
+ o["num_success"].reshape(num_agents, num_envs).T,
210
+ ) # so agents is at the end.
211
+
212
+
213
+ def compute_score(
214
+ config: dict, dones: chex.Array, values: chex.Array, max_returns: chex.Array, reward, info, advantages: chex.Array
215
+ ) -> chex.Array:
216
+ # Computes the score for each level
217
+ if config["score_function"] == "MaxMC":
218
+ return max_mc(dones, values, max_returns)
219
+ elif config["score_function"] == "pvl":
220
+ return positive_value_loss(dones, advantages)
221
+ elif config["score_function"] == "learnability":
222
+ learnability, num_episodes, num_success = compute_learnability(
223
+ config, dones, reward, info, config["num_train_envs"]
224
+ )
225
+ return learnability
226
+ else:
227
+ raise ValueError(f"Unknown score function: {config['score_function']}")
228
+
229
+
230
+ def compute_all_scores(
231
+ config: dict,
232
+ dones: chex.Array,
233
+ values: chex.Array,
234
+ max_returns: chex.Array,
235
+ reward,
236
+ info,
237
+ advantages: chex.Array,
238
+ return_success_rate=False,
239
+ ):
240
+ mc = max_mc(dones, values, max_returns)
241
+ pvl = positive_value_loss(dones, advantages)
242
+ learnability, num_episodes, num_success = compute_learnability(
243
+ config, dones, reward, info, config["num_train_envs"]
244
+ )
245
+ if config["score_function"] == "MaxMC":
246
+ main_score = mc
247
+ elif config["score_function"] == "pvl":
248
+ main_score = pvl
249
+ elif config["score_function"] == "learnability":
250
+ main_score = learnability
251
+ else:
252
+ raise ValueError(f"Unknown score function: {config['score_function']}")
253
+ if return_success_rate:
254
+ success_rate = num_success.squeeze(1) / jnp.maximum(num_episodes, 1)
255
+ return main_score, (mc, pvl, learnability, success_rate)
256
+ return main_score, (mc, pvl, learnability)
257
+
258
+
259
+ @hydra.main(version_base=None, config_path="../configs", config_name="plr")
260
+ def main(config=None):
261
+ my_name = "PLR"
262
+ config = OmegaConf.to_container(config)
263
+ if config["ued"]["replay_prob"] == 0.0:
264
+ my_name = "DR"
265
+ elif config["ued"]["use_accel"]:
266
+ my_name = "ACCEL"
267
+
268
+ time_start = time.time()
269
+ config = normalise_config(config, my_name)
270
+ env_params, static_env_params = generate_params_from_config(config)
271
+ config["env_params"] = to_state_dict(env_params)
272
+ config["static_env_params"] = to_state_dict(static_env_params)
273
+
274
+ run = init_wandb(config, my_name)
275
+ config = wandb.config
276
+ time_prev = time.time()
277
+
278
+ def log_eval(stats, train_state_info):
279
+ nonlocal time_prev
280
+ print(f"Logging update: {stats['update_count']}")
281
+ total_loss = jnp.mean(stats["losses"][0])
282
+ if jnp.isnan(total_loss):
283
+ print("NaN loss, skipping logging")
284
+ raise ValueError("NaN loss")
285
+
286
+ # generic stats
287
+ env_steps = int(
288
+ int(stats["update_count"]) * config["num_train_envs"] * config["num_steps"] * config["outer_rollout_steps"]
289
+ )
290
+ env_steps_delta = (
291
+ config["eval_freq"] * config["num_train_envs"] * config["num_steps"] * config["outer_rollout_steps"]
292
+ )
293
+ time_now = time.time()
294
+ log_dict = {
295
+ "timing/num_updates": stats["update_count"],
296
+ "timing/num_env_steps": env_steps,
297
+ "timing/sps": env_steps_delta / (time_now - time_prev),
298
+ "timing/sps_agg": env_steps / (time_now - time_start),
299
+ "loss/total_loss": jnp.mean(stats["losses"][0]),
300
+ "loss/value_loss": jnp.mean(stats["losses"][1][0]),
301
+ "loss/policy_loss": jnp.mean(stats["losses"][1][1]),
302
+ "loss/entropy_loss": jnp.mean(stats["losses"][1][2]),
303
+ }
304
+ time_prev = time_now
305
+
306
+ # evaluation performance
307
+
308
+ returns = stats["eval_returns"]
309
+ log_dict.update({"eval/mean_eval_return": returns.mean()})
310
+ log_dict.update({"eval/mean_eval_learnability": stats["eval_learn"].mean()})
311
+ log_dict.update({"eval/mean_eval_solve_rate": stats["eval_solves"].mean()})
312
+ log_dict.update({"eval/mean_eval_eplen": stats["eval_ep_lengths"].mean()})
313
+ for i in range(config["num_eval_levels"]):
314
+ log_dict[f"eval_avg_return/{config['eval_levels'][i]}"] = returns[i]
315
+ log_dict[f"eval_avg_learnability/{config['eval_levels'][i]}"] = stats["eval_learn"][i]
316
+ log_dict[f"eval_avg_solve_rate/{config['eval_levels'][i]}"] = stats["eval_solves"][i]
317
+ log_dict[f"eval_avg_episode_length/{config['eval_levels'][i]}"] = stats["eval_ep_lengths"][i]
318
+ log_dict[f"eval_get_max_eplen/{config['eval_levels'][i]}"] = stats["eval_get_max_eplen"][i]
319
+ log_dict[f"episode_return_bigger_than_negative/{config['eval_levels'][i]}"] = stats[
320
+ "episode_return_bigger_than_negative"
321
+ ][i]
322
+
323
+ def _aggregate_per_size(values, name):
324
+ to_return = {}
325
+ for group_name, indices in eval_group_indices.items():
326
+ to_return[f"{name}_{group_name}"] = values[indices].mean()
327
+ return to_return
328
+
329
+ log_dict.update(_aggregate_per_size(returns, "eval_aggregate/return"))
330
+ log_dict.update(_aggregate_per_size(stats["eval_solves"], "eval_aggregate/solve_rate"))
331
+
332
+ if config["EVAL_ON_SAMPLED"]:
333
+ log_dict.update({"eval/mean_eval_return_sampled": stats["eval_dr_returns"].mean()})
334
+ log_dict.update({"eval/mean_eval_solve_rate_sampled": stats["eval_dr_solve_rates"].mean()})
335
+ log_dict.update({"eval/mean_eval_eplen_sampled": stats["eval_dr_eplen"].mean()})
336
+
337
+ # level sampler
338
+ log_dict.update(train_state_info["log"])
339
+
340
+ # images
341
+ log_dict.update(
342
+ {
343
+ "images/highest_scoring_level": wandb.Image(
344
+ np.array(stats["highest_scoring_level"]), caption="Highest scoring level"
345
+ )
346
+ }
347
+ )
348
+ log_dict.update(
349
+ {
350
+ "images/highest_weighted_level": wandb.Image(
351
+ np.array(stats["highest_weighted_level"]), caption="Highest weighted level"
352
+ )
353
+ }
354
+ )
355
+
356
+ for s in ["dr", "replay", "mutation"]:
357
+ if train_state_info["info"][f"num_{s}_updates"] > 0:
358
+ log_dict.update(
359
+ {
360
+ f"images/{s}_levels": [
361
+ wandb.Image(np.array(image), caption=f"{score}")
362
+ for image, score in zip(stats[f"{s}_levels"], stats[f"{s}_scores"])
363
+ ]
364
+ }
365
+ )
366
+ if stats["log_videos"]:
367
+ # animations
368
+ rollout_ep = stats[f"{s}_ep_len"]
369
+ arr = np.array(stats[f"{s}_rollout"][:rollout_ep])
370
+ log_dict.update(
371
+ {
372
+ f"media/{s}_eval": wandb.Video(
373
+ arr.astype(np.uint8), fps=15, caption=f"{s.capitalize()} (len {rollout_ep})"
374
+ )
375
+ }
376
+ )
377
+ # * 255
378
+
379
+ # DR, Replay and Mutate Returns
380
+ dr_inds = (stats["update_state"] == UpdateState.DR).nonzero()[0]
381
+ rep_inds = (stats["update_state"] == UpdateState.REPLAY).nonzero()[0]
382
+ mut_inds = (stats["update_state"] == UpdateState.MUTATE).nonzero()[0]
383
+
384
+ for name, inds in [
385
+ ("DR", dr_inds),
386
+ ("REPLAY", rep_inds),
387
+ ("MUTATION", mut_inds),
388
+ ]:
389
+ if len(inds) > 0:
390
+ log_dict.update(
391
+ {
392
+ f"{name}/episode_return": stats["episode_return"][inds].mean(),
393
+ f"{name}/mean_eplen": stats["returned_episode_lengths"][inds].mean(),
394
+ f"{name}/mean_success": stats["returned_episode_solved"][inds].mean(),
395
+ f"{name}/noop_return": stats["noop_returns"][inds].mean(),
396
+ f"{name}/noop_eplen": stats["noop_eplen"][inds].mean(),
397
+ f"{name}/noop_success": stats["noop_success"][inds].mean(),
398
+ f"{name}/random_return": stats["random_returns"][inds].mean(),
399
+ f"{name}/random_eplen": stats["random_eplen"][inds].mean(),
400
+ f"{name}/random_success": stats["random_success"][inds].mean(),
401
+ }
402
+ )
403
+ for k in stats:
404
+ if "complexity/" in k:
405
+ k2 = "complexity/" + name + "_" + k.replace("complexity/", "")
406
+ log_dict.update({k2: stats[k][inds].mean()})
407
+ if "ued_scores/" in k:
408
+ k2 = "ued_scores/" + name + "_" + k.replace("ued_scores/", "")
409
+ log_dict.update({k2: stats[k][inds].mean()})
410
+
411
+ # Eval rollout animations
412
+ if stats["log_videos"]:
413
+ for i in range((config["num_eval_levels"])):
414
+ frames, episode_length = stats["eval_animation"][0][:, i], stats["eval_animation"][1][i]
415
+ frames = np.array(frames[:episode_length])
416
+ log_dict.update(
417
+ {
418
+ f"media/eval_video_{config['eval_levels'][i]}": wandb.Video(
419
+ frames.astype(np.uint8), fps=15, caption=f"Len ({episode_length})"
420
+ )
421
+ }
422
+ )
423
+
424
+ wandb.log(log_dict)
425
+
426
+ def get_all_metrics(
427
+ rng,
428
+ losses,
429
+ info,
430
+ init_env_state,
431
+ init_obs,
432
+ dones,
433
+ grads,
434
+ all_ued_scores,
435
+ new_levels,
436
+ ):
437
+ noop_returns, noop_len, noop_success, random_returns, random_lens, random_success = no_op_and_random_rollout(
438
+ env,
439
+ env_params,
440
+ rng,
441
+ init_obs,
442
+ init_env_state,
443
+ config["num_train_envs"],
444
+ config["num_steps"] * config["outer_rollout_steps"],
445
+ )
446
+ metrics = (
447
+ {
448
+ "losses": jax.tree_util.tree_map(lambda x: x.mean(), losses),
449
+ "returned_episode_lengths": (info["returned_episode_lengths"] * dones).sum()
450
+ / jnp.maximum(1, dones.sum()),
451
+ "max_episode_length": info["returned_episode_lengths"].max(),
452
+ "levels_played": init_env_state.env_state.env_state,
453
+ "episode_return": (info["returned_episode_returns"] * dones).sum() / jnp.maximum(1, dones.sum()),
454
+ "episode_return_v2": (info["returned_episode_returns"] * info["returned_episode"]).sum()
455
+ / jnp.maximum(1, info["returned_episode"].sum()),
456
+ "grad_norms": grads.mean(),
457
+ "noop_returns": noop_returns,
458
+ "noop_eplen": noop_len,
459
+ "noop_success": noop_success,
460
+ "random_returns": random_returns,
461
+ "random_eplen": random_lens,
462
+ "random_success": random_success,
463
+ "returned_episode_solved": (info["returned_episode_solved"] * dones).sum()
464
+ / jnp.maximum(1, dones.sum()),
465
+ }
466
+ | get_level_complexity_metrics(new_levels, static_env_params)
467
+ | get_ued_score_metrics(all_ued_scores)
468
+ )
469
+ return metrics
470
+
471
+ # Setup the environment.
472
+ def make_env(static_env_params):
473
+ env = make_kinetix_env_from_name(config["env_name"], static_env_params=static_env_params)
474
+ env = AutoReplayWrapper(env)
475
+ env = UnderspecifiedToGymnaxWrapper(env)
476
+ env = DenseRewardWrapper(env, dense_reward_scale=config["dense_reward_scale"])
477
+ env = LogWrapper(env)
478
+ return env
479
+
480
+ env = make_env(static_env_params)
481
+
482
+ if config["train_level_mode"] == "list":
483
+ sample_random_level = make_reset_train_function_with_list_of_levels(
484
+ config, config["train_levels_list"], static_env_params, make_pcg_state=False, is_loading_train_levels=True
485
+ )
486
+ elif config["train_level_mode"] == "random":
487
+ sample_random_level = make_reset_train_function_with_mutations(
488
+ env.physics_engine, env_params, static_env_params, config, make_pcg_state=False
489
+ )
490
+ else:
491
+ raise ValueError(f"Unknown train_level_mode: {config['train_level_mode']}")
492
+
493
+ if config["use_accel"] and config["accel_start_from_empty"]:
494
+
495
+ def make_sample_random_level():
496
+ def inner(rng):
497
+ def _inner_accel(rng):
498
+ return create_random_starting_distribution(
499
+ rng, env_params, static_env_params, ued_params, config["env_size_name"], controllable=True
500
+ )
501
+
502
+ def _inner_accel_not_controllable(rng):
503
+ return create_random_starting_distribution(
504
+ rng, env_params, static_env_params, ued_params, config["env_size_name"], controllable=False
505
+ )
506
+
507
+ rng, _rng = jax.random.split(rng)
508
+ return _inner_accel(_rng)
509
+
510
+ return inner
511
+
512
+ sample_random_level = make_sample_random_level()
513
+
514
+ sample_random_levels = make_vmapped_filtered_level_sampler(
515
+ sample_random_level, env_params, static_env_params, config, make_pcg_state=False, env=env
516
+ )
517
+
518
+ def generate_world():
519
+ raise NotImplementedError
520
+ pass
521
+
522
+ def generate_eval_world(rng, env_params, static_env_params, level_idx):
523
+ # jax.random.split(jax.random.PRNGKey(101), num_levels), env_params, static_env_params, jnp.arange(num_levels)
524
+
525
+ raise NotImplementedError
526
+
527
+ _, eval_static_env_params = generate_params_from_config(
528
+ config["eval_env_size_true"] | {"frame_skip": config["frame_skip"]}
529
+ )
530
+ eval_env = make_env(eval_static_env_params)
531
+ ued_params = generate_ued_params_from_config(config)
532
+
533
+ mutate_world = make_mutate_env(static_env_params, env_params, ued_params)
534
+
535
+ def make_render_fn(static_env_params):
536
+ render_fn_inner = make_render_pixels(env_params, static_env_params)
537
+ render_fn = lambda x: render_fn_inner(x).transpose(1, 0, 2)[::-1]
538
+ return render_fn
539
+
540
+ render_fn = make_render_fn(static_env_params)
541
+ render_fn_eval = make_render_fn(eval_static_env_params)
542
+ if config["EVAL_ON_SAMPLED"]:
543
+ NUM_EVAL_DR_LEVELS = 200
544
+ key_to_sample_dr_eval_set = jax.random.PRNGKey(100)
545
+ DR_EVAL_LEVELS = sample_random_levels(key_to_sample_dr_eval_set, NUM_EVAL_DR_LEVELS)
546
+
547
+ # And the level sampler
548
+ level_sampler = LevelSampler(
549
+ capacity=config["level_buffer_capacity"],
550
+ replay_prob=config["replay_prob"],
551
+ staleness_coeff=config["staleness_coeff"],
552
+ minimum_fill_ratio=config["minimum_fill_ratio"],
553
+ prioritization=config["prioritization"],
554
+ prioritization_params={"temperature": config["temperature"], "k": config["topk_k"]},
555
+ duplicate_check=config["buffer_duplicate_check"],
556
+ )
557
+
558
+ @jax.jit
559
+ def create_train_state(rng) -> TrainState:
560
+ # Creates the train state
561
+ def linear_schedule(count):
562
+ frac = 1.0 - (count // (config["num_minibatches"] * config["update_epochs"])) / (
563
+ config["num_updates"] * config["outer_rollout_steps"]
564
+ )
565
+ return config["lr"] * frac
566
+
567
+ rng, _rng = jax.random.split(rng)
568
+ init_state = jax.tree.map(lambda x: x[0], sample_random_levels(_rng, 1))
569
+
570
+ rng, _rng = jax.random.split(rng)
571
+ obs, _ = env.reset_to_level(_rng, init_state, env_params)
572
+ ns = config["num_steps"] * config["outer_rollout_steps"]
573
+ obs = jax.tree.map(
574
+ lambda x: jnp.repeat(jnp.repeat(x[None, ...], config["num_train_envs"], axis=0)[None, ...], ns, axis=0),
575
+ obs,
576
+ )
577
+ init_x = (obs, jnp.zeros((ns, config["num_train_envs"]), dtype=jnp.bool_))
578
+ network = make_network_from_config(env, env_params, config)
579
+ rng, _rng = jax.random.split(rng)
580
+ network_params = network.init(_rng, ScannedRNN.initialize_carry(config["num_train_envs"]), init_x)
581
+
582
+ if config["anneal_lr"]:
583
+ tx = optax.chain(
584
+ optax.clip_by_global_norm(config["max_grad_norm"]),
585
+ optax.adam(learning_rate=linear_schedule, eps=1e-5),
586
+ )
587
+ else:
588
+ tx = optax.chain(
589
+ optax.clip_by_global_norm(config["max_grad_norm"]),
590
+ optax.adam(config["lr"], eps=1e-5),
591
+ )
592
+
593
+ pholder_level = jax.tree.map(lambda x: x[0], sample_random_levels(jax.random.PRNGKey(0), 1))
594
+ sampler = level_sampler.initialize(pholder_level, {"max_return": -jnp.inf})
595
+ pholder_level_batch = jax.tree_util.tree_map(
596
+ lambda x: jnp.array([x]).repeat(config["num_train_envs"], axis=0), pholder_level
597
+ )
598
+ pholder_rollout_batch = (
599
+ jax.tree.map(
600
+ lambda x: jnp.repeat(
601
+ jnp.expand_dims(x, 0), repeats=config["num_steps"] * config["outer_rollout_steps"], axis=0
602
+ ),
603
+ init_state,
604
+ ),
605
+ init_x[1][:, 0],
606
+ )
607
+
608
+ pholder_level_batch_scores = jnp.zeros((config["num_train_envs"],), dtype=jnp.float32)
609
+ train_state = TrainState.create(
610
+ apply_fn=network.apply,
611
+ params=network_params,
612
+ tx=tx,
613
+ sampler=sampler,
614
+ update_state=0,
615
+ num_dr_updates=0,
616
+ num_replay_updates=0,
617
+ num_mutation_updates=0,
618
+ dr_last_level_batch_scores=pholder_level_batch_scores,
619
+ replay_last_level_batch_scores=pholder_level_batch_scores,
620
+ mutation_last_level_batch_scores=pholder_level_batch_scores,
621
+ dr_last_level_batch=pholder_level_batch,
622
+ replay_last_level_batch=pholder_level_batch,
623
+ mutation_last_level_batch=pholder_level_batch,
624
+ dr_last_rollout_batch=pholder_rollout_batch,
625
+ replay_last_rollout_batch=pholder_rollout_batch,
626
+ mutation_last_rollout_batch=pholder_rollout_batch,
627
+ )
628
+
629
+ if config["load_from_checkpoint"] != None:
630
+ print("LOADING from", config["load_from_checkpoint"], "with only params =", config["load_only_params"])
631
+ train_state = load_train_state_from_wandb_artifact_path(
632
+ train_state,
633
+ config["load_from_checkpoint"],
634
+ load_only_params=config["load_only_params"],
635
+ legacy=config["load_legacy_checkpoint"],
636
+ )
637
+ return train_state
638
+
639
+ all_eval_levels = get_eval_levels(config["eval_levels"], eval_env.static_env_params)
640
+ eval_group_indices = get_eval_level_groups(config["eval_levels"])
641
+
642
+ @jax.jit
643
+ def train_step(carry: Tuple[chex.PRNGKey, TrainState], _):
644
+ """
645
+ This is the main training loop. It basically calls either `on_new_levels`, `on_replay_levels`, or `on_mutate_levels` at every step.
646
+ """
647
+
648
+ def on_new_levels(rng: chex.PRNGKey, train_state: TrainState):
649
+ """
650
+ Samples new (randomly-generated) levels and evaluates the policy on these. It also then adds the levels to the level buffer if they have high-enough scores.
651
+ The agent is updated on these trajectories iff `config["exploratory_grad_updates"]` is True.
652
+ """
653
+ sampler = train_state.sampler
654
+
655
+ # Reset
656
+ rng, rng_levels, rng_reset = jax.random.split(rng, 3)
657
+ new_levels = sample_random_levels(rng_levels, config["num_train_envs"])
658
+ init_obs, init_env_state = jax.vmap(env.reset_to_level, in_axes=(0, 0, None))(
659
+ jax.random.split(rng_reset, config["num_train_envs"]), new_levels, env_params
660
+ )
661
+ init_hstate = ScannedRNN.initialize_carry(config["num_train_envs"])
662
+ # Rollout
663
+ (
664
+ (rng, train_state, new_hstate, last_obs, last_env_state),
665
+ (
666
+ obs,
667
+ actions,
668
+ rewards,
669
+ dones,
670
+ log_probs,
671
+ values,
672
+ info,
673
+ advantages,
674
+ targets,
675
+ losses,
676
+ grads,
677
+ rollout_states,
678
+ ),
679
+ ) = sample_trajectories_and_learn(
680
+ env,
681
+ env_params,
682
+ config,
683
+ rng,
684
+ train_state,
685
+ init_hstate,
686
+ init_obs,
687
+ init_env_state,
688
+ update_grad=config["exploratory_grad_updates"],
689
+ return_states=True,
690
+ )
691
+ max_returns = compute_max_returns(dones, rewards)
692
+ scores, all_ued_scores = compute_all_scores(config, dones, values, max_returns, rewards, info, advantages)
693
+ sampler, _ = level_sampler.insert_batch(sampler, new_levels, scores, {"max_return": max_returns})
694
+ rng, _rng = jax.random.split(rng)
695
+ metrics = {
696
+ "update_state": UpdateState.DR,
697
+ } | get_all_metrics(_rng, losses, info, init_env_state, init_obs, dones, grads, all_ued_scores, new_levels)
698
+
699
+ train_state = train_state.replace(
700
+ sampler=sampler,
701
+ update_state=UpdateState.DR,
702
+ num_dr_updates=train_state.num_dr_updates + 1,
703
+ dr_last_level_batch=new_levels,
704
+ dr_last_level_batch_scores=scores,
705
+ dr_last_rollout_batch=jax.tree.map(
706
+ lambda x: x[:, 0], (rollout_states.env_state.env_state.env_state, dones)
707
+ ),
708
+ )
709
+ return (rng, train_state), metrics
710
+
711
+ def on_replay_levels(rng: chex.PRNGKey, train_state: TrainState):
712
+ """
713
+ This samples levels from the level buffer, and updates the policy on them.
714
+ """
715
+ sampler = train_state.sampler
716
+
717
+ # Collect trajectories on replay levels
718
+ rng, rng_levels, rng_reset = jax.random.split(rng, 3)
719
+ sampler, (level_inds, levels) = level_sampler.sample_replay_levels(
720
+ sampler, rng_levels, config["num_train_envs"]
721
+ )
722
+ init_obs, init_env_state = jax.vmap(env.reset_to_level, in_axes=(0, 0, None))(
723
+ jax.random.split(rng_reset, config["num_train_envs"]), levels, env_params
724
+ )
725
+ init_hstate = ScannedRNN.initialize_carry(config["num_train_envs"])
726
+ (
727
+ (rng, train_state, new_hstate, last_obs, last_env_state),
728
+ (
729
+ obs,
730
+ actions,
731
+ rewards,
732
+ dones,
733
+ log_probs,
734
+ values,
735
+ info,
736
+ advantages,
737
+ targets,
738
+ losses,
739
+ grads,
740
+ rollout_states,
741
+ ),
742
+ ) = sample_trajectories_and_learn(
743
+ env,
744
+ env_params,
745
+ config,
746
+ rng,
747
+ train_state,
748
+ init_hstate,
749
+ init_obs,
750
+ init_env_state,
751
+ update_grad=True,
752
+ return_states=True,
753
+ )
754
+
755
+ max_returns = jnp.maximum(
756
+ level_sampler.get_levels_extra(sampler, level_inds)["max_return"], compute_max_returns(dones, rewards)
757
+ )
758
+ scores, all_ued_scores = compute_all_scores(config, dones, values, max_returns, rewards, info, advantages)
759
+ sampler = level_sampler.update_batch(sampler, level_inds, scores, {"max_return": max_returns})
760
+
761
+ rng, _rng = jax.random.split(rng)
762
+ metrics = {
763
+ "update_state": UpdateState.REPLAY,
764
+ } | get_all_metrics(_rng, losses, info, init_env_state, init_obs, dones, grads, all_ued_scores, levels)
765
+ train_state = train_state.replace(
766
+ sampler=sampler,
767
+ update_state=UpdateState.REPLAY,
768
+ num_replay_updates=train_state.num_replay_updates + 1,
769
+ replay_last_level_batch=levels,
770
+ replay_last_level_batch_scores=scores,
771
+ replay_last_rollout_batch=jax.tree.map(
772
+ lambda x: x[:, 0], (rollout_states.env_state.env_state.env_state, dones)
773
+ ),
774
+ )
775
+ return (rng, train_state), metrics
776
+
777
+ def on_mutate_levels(rng: chex.PRNGKey, train_state: TrainState):
778
+ """
779
+ This mutates the previous batch of replay levels and potentially adds them to the level buffer.
780
+ This also updates the policy iff `config["exploratory_grad_updates"]` is True.
781
+ """
782
+
783
+ sampler = train_state.sampler
784
+ rng, rng_mutate, rng_reset = jax.random.split(rng, 3)
785
+
786
+ # mutate
787
+ parent_levels = train_state.replay_last_level_batch
788
+ child_levels = jax.vmap(mutate_world, (0, 0, None))(
789
+ jax.random.split(rng_mutate, config["num_train_envs"]), parent_levels, config["num_edits"]
790
+ )
791
+ init_obs, init_env_state = jax.vmap(env.reset_to_level, in_axes=(0, 0, None))(
792
+ jax.random.split(rng_reset, config["num_train_envs"]), child_levels, env_params
793
+ )
794
+
795
+ init_hstate = ScannedRNN.initialize_carry(config["num_train_envs"])
796
+ # rollout
797
+ (
798
+ (rng, train_state, new_hstate, last_obs, last_env_state),
799
+ (
800
+ obs,
801
+ actions,
802
+ rewards,
803
+ dones,
804
+ log_probs,
805
+ values,
806
+ info,
807
+ advantages,
808
+ targets,
809
+ losses,
810
+ grads,
811
+ rollout_states,
812
+ ),
813
+ ) = sample_trajectories_and_learn(
814
+ env,
815
+ env_params,
816
+ config,
817
+ rng,
818
+ train_state,
819
+ init_hstate,
820
+ init_obs,
821
+ init_env_state,
822
+ update_grad=config["exploratory_grad_updates"],
823
+ return_states=True,
824
+ )
825
+
826
+ max_returns = compute_max_returns(dones, rewards)
827
+ scores, all_ued_scores = compute_all_scores(config, dones, values, max_returns, rewards, info, advantages)
828
+ sampler, _ = level_sampler.insert_batch(sampler, child_levels, scores, {"max_return": max_returns})
829
+
830
+ rng, _rng = jax.random.split(rng)
831
+ metrics = {"update_state": UpdateState.MUTATE,} | get_all_metrics(
832
+ _rng, losses, info, init_env_state, init_obs, dones, grads, all_ued_scores, child_levels
833
+ )
834
+
835
+ train_state = train_state.replace(
836
+ sampler=sampler,
837
+ update_state=UpdateState.DR,
838
+ num_mutation_updates=train_state.num_mutation_updates + 1,
839
+ mutation_last_level_batch=child_levels,
840
+ mutation_last_level_batch_scores=scores,
841
+ mutation_last_rollout_batch=jax.tree.map(
842
+ lambda x: x[:, 0], (rollout_states.env_state.env_state.env_state, dones)
843
+ ),
844
+ )
845
+ return (rng, train_state), metrics
846
+
847
+ rng, train_state = carry
848
+ rng, rng_replay = jax.random.split(rng)
849
+
850
+ # The train step makes a decision on which branch to take, either on_new, on_replay or on_mutate.
851
+ # on_mutate is only called if the replay branch has been taken before (as it uses `train_state.update_state`).
852
+ branches = [
853
+ on_new_levels,
854
+ on_replay_levels,
855
+ ]
856
+ if config["use_accel"]:
857
+ s = train_state.update_state
858
+ branch = (1 - s) * level_sampler.sample_replay_decision(train_state.sampler, rng_replay) + 2 * s
859
+ branches.append(on_mutate_levels)
860
+ else:
861
+ branch = level_sampler.sample_replay_decision(train_state.sampler, rng_replay).astype(int)
862
+
863
+ return jax.lax.switch(branch, branches, rng, train_state)
864
+
865
+ @partial(jax.jit, static_argnums=(2,))
866
+ def eval(rng: chex.PRNGKey, train_state: TrainState, keep_states=True):
867
+ """
868
+ This evaluates the current policy on the set of evaluation levels specified by config["eval_levels"].
869
+ It returns (states, cum_rewards, episode_lengths), with shapes (num_steps, num_eval_levels, ...), (num_eval_levels,), (num_eval_levels,)
870
+ """
871
+ num_levels = config["num_eval_levels"]
872
+ return general_eval(
873
+ rng,
874
+ eval_env,
875
+ env_params,
876
+ train_state,
877
+ all_eval_levels,
878
+ env_params.max_timesteps,
879
+ num_levels,
880
+ keep_states=keep_states,
881
+ return_trajectories=True,
882
+ )
883
+
884
+ @partial(jax.jit, static_argnums=(2,))
885
+ def eval_on_dr_levels(rng: chex.PRNGKey, train_state: TrainState, keep_states=False):
886
+ return general_eval(
887
+ rng,
888
+ env,
889
+ env_params,
890
+ train_state,
891
+ DR_EVAL_LEVELS,
892
+ env_params.max_timesteps,
893
+ NUM_EVAL_DR_LEVELS,
894
+ keep_states=keep_states,
895
+ )
896
+
897
+ @jax.jit
898
+ def train_and_eval_step(runner_state, _):
899
+ """
900
+ This function runs the train_step for a certain number of iterations, and then evaluates the policy.
901
+ It returns the updated train state, and a dictionary of metrics.
902
+ """
903
+ # Train
904
+ (rng, train_state), metrics = jax.lax.scan(train_step, runner_state, None, config["eval_freq"])
905
+
906
+ # Eval
907
+ metrics["update_count"] = (
908
+ train_state.num_dr_updates + train_state.num_replay_updates + train_state.num_mutation_updates
909
+ )
910
+
911
+ vid_frequency = get_video_frequency(config, metrics["update_count"])
912
+ should_log_videos = metrics["update_count"] % vid_frequency == 0
913
+
914
+ def _compute_eval_learnability(dones, rewards, infos):
915
+ @jax.vmap
916
+ def _single(d, r, i):
917
+ learn, num_eps, num_succ = compute_learnability(config, d, r, i, config["num_eval_levels"])
918
+
919
+ return num_eps, num_succ.squeeze(-1)
920
+
921
+ num_eps, num_succ = _single(dones, rewards, infos)
922
+ num_eps, num_succ = num_eps.sum(axis=0), num_succ.sum(axis=0)
923
+ success_rate = num_succ / jnp.maximum(1, num_eps)
924
+
925
+ return success_rate * (1 - success_rate)
926
+
927
+ @jax.jit
928
+ def _get_eval(rng):
929
+ metrics = {}
930
+ rng, rng_eval = jax.random.split(rng)
931
+ (states, cum_rewards, done_idx, episode_lengths, eval_infos), (eval_dones, eval_rewards) = jax.vmap(
932
+ eval, (0, None)
933
+ )(jax.random.split(rng_eval, config["eval_num_attempts"]), train_state)
934
+
935
+ # learnability here of the holdout set:
936
+ eval_learn = _compute_eval_learnability(eval_dones, eval_rewards, eval_infos)
937
+ # Collect Metrics
938
+ eval_returns = cum_rewards.mean(axis=0) # (num_eval_levels,)
939
+ eval_solves = (eval_infos["returned_episode_solved"] * eval_dones).sum(axis=1) / jnp.maximum(
940
+ 1, eval_dones.sum(axis=1)
941
+ )
942
+ eval_solves = eval_solves.mean(axis=0)
943
+ metrics["eval_returns"] = eval_returns
944
+ metrics["eval_ep_lengths"] = episode_lengths.mean(axis=0)
945
+ metrics["eval_learn"] = eval_learn
946
+ metrics["eval_solves"] = eval_solves
947
+
948
+ metrics["eval_get_max_eplen"] = (episode_lengths == env_params.max_timesteps).mean(axis=0)
949
+ metrics["episode_return_bigger_than_negative"] = (cum_rewards > -0.4).mean(axis=0)
950
+
951
+ if config["EVAL_ON_SAMPLED"]:
952
+ states_dr, cum_rewards_dr, done_idx_dr, episode_lengths_dr, infos_dr = jax.vmap(
953
+ eval_on_dr_levels, (0, None)
954
+ )(jax.random.split(rng_eval, config["eval_num_attempts"]), train_state)
955
+
956
+ eval_dr_returns = cum_rewards_dr.mean(axis=0).mean()
957
+ eval_dr_eplen = episode_lengths_dr.mean(axis=0).mean()
958
+
959
+ my_eval_dones = infos_dr["returned_episode"]
960
+ eval_dr_solves = (infos_dr["returned_episode_solved"] * my_eval_dones).sum(axis=1) / jnp.maximum(
961
+ 1, my_eval_dones.sum(axis=1)
962
+ )
963
+
964
+ metrics["eval_dr_returns"] = eval_dr_returns
965
+ metrics["eval_dr_eplen"] = eval_dr_eplen
966
+ metrics["eval_dr_solve_rates"] = eval_dr_solves
967
+ return metrics, states, episode_lengths, cum_rewards
968
+
969
+ @jax.jit
970
+ def _get_videos(rng, states, episode_lengths, cum_rewards):
971
+ metrics = {"log_videos": True}
972
+
973
+ # just grab the first run
974
+ states, episode_lengths = jax.tree_util.tree_map(
975
+ lambda x: x[0], (states, episode_lengths)
976
+ ) # (num_steps, num_eval_levels, ...), (num_eval_levels,)
977
+ # And one attempt
978
+ states = jax.tree_util.tree_map(lambda x: x[:, :], states)
979
+ episode_lengths = episode_lengths[:]
980
+ images = jax.vmap(jax.vmap(render_fn_eval))(
981
+ states.env_state.env_state.env_state
982
+ ) # (num_steps, num_eval_levels, ...)
983
+ frames = images.transpose(
984
+ 0, 1, 4, 2, 3
985
+ ) # WandB expects color channel before image dimensions when dealing with animations for some reason
986
+
987
+ @jax.jit
988
+ def _get_video(rollout_batch):
989
+ states = rollout_batch[0]
990
+ images = jax.vmap(render_fn)(states) # dimensions are (steps, x, y, 3)
991
+ return (
992
+ # jax.tree.map(lambda x: x[:].transpose(0, 2, 1, 3)[:, ::-1], images).transpose(0, 3, 1, 2),
993
+ images.transpose(0, 3, 1, 2),
994
+ # images.transpose(0, 1, 4, 2, 3),
995
+ rollout_batch[1][:].argmax(),
996
+ )
997
+
998
+ # rollouts
999
+ metrics["dr_rollout"], metrics["dr_ep_len"] = _get_video(train_state.dr_last_rollout_batch)
1000
+ metrics["replay_rollout"], metrics["replay_ep_len"] = _get_video(train_state.replay_last_rollout_batch)
1001
+ metrics["mutation_rollout"], metrics["mutation_ep_len"] = _get_video(
1002
+ train_state.mutation_last_rollout_batch
1003
+ )
1004
+
1005
+ metrics["eval_animation"] = (frames, episode_lengths)
1006
+
1007
+ metrics["eval_returns_video"] = cum_rewards[0]
1008
+ metrics["eval_len_video"] = episode_lengths
1009
+
1010
+ # Eval on sampled
1011
+
1012
+ return metrics
1013
+
1014
+ @jax.jit
1015
+ def _get_dummy_videos(rng, states, episode_lengths, cum_rewards):
1016
+ n_eval = config["num_eval_levels"]
1017
+ nsteps = env_params.max_timesteps
1018
+ nsteps2 = config["outer_rollout_steps"] * config["num_steps"]
1019
+ img_size = (
1020
+ env.static_env_params.screen_dim[0] // env.static_env_params.downscale,
1021
+ env.static_env_params.screen_dim[1] // env.static_env_params.downscale,
1022
+ )
1023
+ return {
1024
+ "log_videos": False,
1025
+ "dr_rollout": jnp.zeros((nsteps2, 3, *img_size), jnp.float32),
1026
+ "dr_ep_len": jnp.zeros((), jnp.int32),
1027
+ "replay_rollout": jnp.zeros((nsteps2, 3, *img_size), jnp.float32),
1028
+ "replay_ep_len": jnp.zeros((), jnp.int32),
1029
+ "mutation_rollout": jnp.zeros((nsteps2, 3, *img_size), jnp.float32),
1030
+ "mutation_ep_len": jnp.zeros((), jnp.int32),
1031
+ # "eval_returns": jnp.zeros((n_eval,), jnp.float32),
1032
+ # "eval_solves": jnp.zeros((n_eval,), jnp.float32),
1033
+ # "eval_learn": jnp.zeros((n_eval,), jnp.float32),
1034
+ # "eval_ep_lengths": jnp.zeros((n_eval,), jnp.int32),
1035
+ "eval_animation": (
1036
+ jnp.zeros((nsteps, n_eval, 3, *img_size), jnp.float32),
1037
+ jnp.zeros((n_eval,), jnp.int32),
1038
+ ),
1039
+ "eval_returns_video": jnp.zeros((n_eval,), jnp.float32),
1040
+ "eval_len_video": jnp.zeros((n_eval,), jnp.int32),
1041
+ }
1042
+
1043
+ rng, rng_eval, rng_vid = jax.random.split(rng, 3)
1044
+
1045
+ metrics_eval, states, episode_lengths, cum_rewards = _get_eval(rng_eval)
1046
+ metrics = {
1047
+ **metrics,
1048
+ **metrics_eval,
1049
+ **jax.lax.cond(
1050
+ should_log_videos, _get_videos, _get_dummy_videos, rng_vid, states, episode_lengths, cum_rewards
1051
+ ),
1052
+ }
1053
+ max_num_images = 8
1054
+
1055
+ top_regret_ones = max_num_images // 2
1056
+ bot_regret_ones = max_num_images - top_regret_ones
1057
+
1058
+ @jax.jit
1059
+ def get_values(level_batch, scores):
1060
+ args = jnp.argsort(scores) # low scores are at the start, high scores are at the end
1061
+
1062
+ low_scores = args[:bot_regret_ones]
1063
+ high_scores = args[-top_regret_ones:]
1064
+
1065
+ low_levels = jax.tree.map(lambda x: x[low_scores], level_batch)
1066
+ high_levels = jax.tree.map(lambda x: x[high_scores], level_batch)
1067
+
1068
+ low_scores = scores[low_scores]
1069
+ high_scores = scores[high_scores]
1070
+ # now concatenate:
1071
+ return jax.vmap(render_fn)(
1072
+ jax.tree.map(lambda x, y: jnp.concatenate([x, y], axis=0), low_levels, high_levels)
1073
+ ), jnp.concatenate([low_scores, high_scores], axis=0)
1074
+
1075
+ metrics["dr_levels"], metrics["dr_scores"] = get_values(
1076
+ train_state.dr_last_level_batch, train_state.dr_last_level_batch_scores
1077
+ )
1078
+ metrics["replay_levels"], metrics["replay_scores"] = get_values(
1079
+ train_state.replay_last_level_batch, train_state.replay_last_level_batch_scores
1080
+ )
1081
+ metrics["mutation_levels"], metrics["mutation_scores"] = get_values(
1082
+ train_state.mutation_last_level_batch, train_state.mutation_last_level_batch_scores
1083
+ )
1084
+
1085
+ def _t(i):
1086
+ return jax.lax.select(i == 0, config["num_steps"], i)
1087
+
1088
+ metrics["dr_ep_len"] = _t(train_state.dr_last_rollout_batch[1][:].argmax())
1089
+ metrics["replay_ep_len"] = _t(train_state.replay_last_rollout_batch[1][:].argmax())
1090
+ metrics["mutation_ep_len"] = _t(train_state.mutation_last_rollout_batch[1][:].argmax())
1091
+
1092
+ highest_scoring_level = level_sampler.get_levels(train_state.sampler, train_state.sampler["scores"].argmax())
1093
+ highest_weighted_level = level_sampler.get_levels(
1094
+ train_state.sampler, level_sampler.level_weights(train_state.sampler).argmax()
1095
+ )
1096
+
1097
+ metrics["highest_scoring_level"] = render_fn(highest_scoring_level)
1098
+ metrics["highest_weighted_level"] = render_fn(highest_weighted_level)
1099
+
1100
+ # log_eval(metrics, train_state_to_log_dict(runner_state[1], level_sampler))
1101
+ jax.debug.callback(log_eval, metrics, train_state_to_log_dict(runner_state[1], level_sampler))
1102
+ return (rng, train_state), {"update_count": metrics["update_count"]}
1103
+
1104
+ def log_checkpoint(update_count, train_state):
1105
+ if config["save_path"] is not None and config["checkpoint_save_freq"] > 1:
1106
+ steps = (
1107
+ int(update_count)
1108
+ * int(config["num_train_envs"])
1109
+ * int(config["num_steps"])
1110
+ * int(config["outer_rollout_steps"])
1111
+ )
1112
+ # save_params_to_wandb(train_state.params, steps, config)
1113
+ save_model_to_wandb(train_state, steps, config)
1114
+
1115
+ def train_eval_and_checkpoint_step(runner_state, _):
1116
+ runner_state, metrics = jax.lax.scan(
1117
+ train_and_eval_step, runner_state, xs=jnp.arange(config["checkpoint_save_freq"] // config["eval_freq"])
1118
+ )
1119
+ jax.debug.callback(log_checkpoint, metrics["update_count"][-1], runner_state[1])
1120
+ return runner_state, metrics
1121
+
1122
+ # Set up the train states
1123
+ rng = jax.random.PRNGKey(config["seed"])
1124
+ rng_init, rng_train = jax.random.split(rng)
1125
+
1126
+ train_state = create_train_state(rng_init)
1127
+ runner_state = (rng_train, train_state)
1128
+
1129
+ runner_state, metrics = jax.lax.scan(
1130
+ train_eval_and_checkpoint_step,
1131
+ runner_state,
1132
+ xs=jnp.arange((config["num_updates"]) // (config["checkpoint_save_freq"])),
1133
+ )
1134
+
1135
+ if config["save_path"] is not None:
1136
+ # save_params_to_wandb(runner_state[1].params, config["total_timesteps"], config)
1137
+ save_model_to_wandb(runner_state[1], config["total_timesteps"], config, is_final=True)
1138
+
1139
+ return runner_state[1]
1140
+
1141
+
1142
+ if __name__ == "__main__":
1143
+ main()
Kinetix/experiments/ppo.py ADDED
@@ -0,0 +1,468 @@
1
+ import os
2
+ import hydra
3
+ from omegaconf import OmegaConf
4
+
5
+ from kinetix.environment.ued.ued import (
6
+ make_reset_train_function_with_list_of_levels,
7
+ make_reset_train_function_with_mutations,
8
+ )
9
+ from kinetix.render.renderer_pixels import make_render_pixels
10
+ from kinetix.util.config import (
11
+ get_video_frequency,
12
+ init_wandb,
13
+ normalise_config,
14
+ generate_params_from_config,
15
+ )
16
+
17
+ os.environ["WANDB_DISABLE_SERVICE"] = "True"
18
+
19
+
20
+ import sys
21
+ from typing import Any, NamedTuple
22
+
23
+ import jax
24
+ import jax.numpy as jnp
25
+ import numpy as np
26
+ import optax
27
+ from flax.training.train_state import TrainState
28
+
29
+ from kinetix.models import make_network_from_config
30
+ from kinetix.util.learning import general_eval, get_eval_levels
31
+ from flax.serialization import to_state_dict
32
+
33
+ import wandb
34
+ from kinetix.environment.env import PixelObservations, make_kinetix_env_from_name
35
+ from kinetix.environment.wrappers import (
36
+ AutoReplayWrapper,
37
+ AutoResetWrapper,
38
+ BatchEnvWrapper,
39
+ DenseRewardWrapper,
40
+ LogWrapper,
41
+ UnderspecifiedToGymnaxWrapper,
42
+ )
43
+ from kinetix.models.actor_critic import ScannedRNN
44
+ from kinetix.util.saving import (
45
+ load_train_state_from_wandb_artifact_path,
46
+ save_model_to_wandb,
47
+ )
48
+
49
+
50
+ class Transition(NamedTuple):
51
+ done: jnp.ndarray
52
+ action: jnp.ndarray
53
+ value: jnp.ndarray
54
+ reward: jnp.ndarray
55
+ log_prob: jnp.ndarray
56
+ obs: Any
57
+ info: jnp.ndarray
58
+
59
+
60
+ def make_train(config, env_params, static_env_params):
61
+ config["num_updates"] = config["total_timesteps"] // config["num_steps"] // config["num_train_envs"]
62
+ config["minibatch_size"] = config["num_train_envs"] * config["num_steps"] // config["num_minibatches"]
63
+
64
+ env = make_kinetix_env_from_name(config["env_name"], static_env_params=static_env_params)
65
+
66
+ if config["train_level_mode"] == "list":
67
+ reset_func = make_reset_train_function_with_list_of_levels(
68
+ config, config["train_levels_list"], static_env_params, is_loading_train_levels=True
69
+ )
70
+ elif config["train_level_mode"] == "random":
71
+ reset_func = make_reset_train_function_with_mutations(
72
+ env.physics_engine, env_params, env.static_env_params, config
73
+ )
74
+ else:
75
+ raise ValueError(f"Unknown train_level_mode: {config['train_level_mode']}")
76
+
77
+ env = UnderspecifiedToGymnaxWrapper(AutoResetWrapper(env, reset_func))
78
+
79
+ eval_env = make_kinetix_env_from_name(config["env_name"], static_env_params=static_env_params)
80
+ eval_env = UnderspecifiedToGymnaxWrapper(AutoReplayWrapper(eval_env))
81
+
82
+ env = DenseRewardWrapper(env)
83
+ env = LogWrapper(env)
84
+ env = BatchEnvWrapper(env, num_envs=config["num_train_envs"])
85
+
86
+ eval_env_nonbatch = LogWrapper(DenseRewardWrapper(eval_env))
87
+
88
+ def linear_schedule(count):
89
+ frac = 1.0 - (count // (config["num_minibatches"] * config["update_epochs"])) / config["num_updates"]
90
+ return config["lr"] * frac
91
+
92
+ def linear_warmup_cosine_decay_schedule(count):
93
+ frac = (count // (config["num_minibatches"] * config["update_epochs"])) / config[
94
+ "num_updates"
95
+ ] # between 0 and 1
96
+ delta = config["peak_lr"] - config["initial_lr"]
97
+ frac_diff_max = 1.0 - config["warmup_frac"]
98
+ frac_cosine = (frac - config["warmup_frac"]) / frac_diff_max
99
+
100
+ return jax.lax.select(
101
+ frac < config["warmup_frac"],
102
+ config["initial_lr"] + delta * frac / config["warmup_frac"],
103
+ config["peak_lr"] * jnp.maximum(0.0, 0.5 * (1.0 + jnp.cos(jnp.pi * ((frac_cosine) % 1.0)))),
104
+ )
105
+
106
+ def train(rng):
107
+ # INIT NETWORK
108
+ network = make_network_from_config(env, env_params, config)
109
+ rng, _rng = jax.random.split(rng)
110
+ obsv, env_state = env.reset(_rng, env_params)
111
+ dones = jnp.zeros((config["num_train_envs"]), dtype=jnp.bool_)
112
+ rng, _rng = jax.random.split(rng)
113
+ init_hstate = ScannedRNN.initialize_carry(config["num_train_envs"])
114
+ init_x = jax.tree.map(lambda x: x[None, ...], (obsv, dones))
115
+ network_params = network.init(_rng, init_hstate, init_x)
116
+
117
+ param_count = sum(x.size for x in jax.tree_util.tree_leaves(network_params))
118
+ obs_size = sum(x.size for x in jax.tree_util.tree_leaves(obsv)) // config["num_train_envs"]
119
+
120
+ print("Number of parameters", param_count, "size of obs: ", obs_size)
121
+ if config["anneal_lr"]:
122
+ tx = optax.chain(
123
+ optax.clip_by_global_norm(config["max_grad_norm"]),
124
+ optax.adam(learning_rate=linear_schedule, eps=1e-5),
125
+ )
126
+ elif config["warmup_lr"]:
127
+ tx = optax.chain(
128
+ optax.clip_by_global_norm(config["max_grad_norm"]),
129
+ optax.adamw(learning_rate=linear_warmup_cosine_decay_schedule, eps=1e-5),
130
+ )
131
+ else:
132
+ tx = optax.chain(
133
+ optax.clip_by_global_norm(config["max_grad_norm"]),
134
+ optax.adam(config["lr"], eps=1e-5),
135
+ )
136
+ train_state = TrainState.create(
137
+ apply_fn=network.apply,
138
+ params=network_params,
139
+ tx=tx,
140
+ )
141
+ if config["load_from_checkpoint"] != None:
142
+ print("LOADING from", config["load_from_checkpoint"], "with only params =", config["load_only_params"])
143
+ train_state = load_train_state_from_wandb_artifact_path(
144
+ train_state, config["load_from_checkpoint"], load_only_params=config["load_only_params"]
145
+ )
146
+ # INIT ENV
147
+ rng, _rng = jax.random.split(rng)
148
+ obsv, env_state = env.reset(_rng, env_params)
149
+ init_hstate = ScannedRNN.initialize_carry(config["num_train_envs"])
150
+ render_static_env_params = env.static_env_params.replace(downscale=1)
151
+ pixel_renderer = jax.jit(make_render_pixels(env_params, render_static_env_params))
152
+ pixel_render_fn = lambda x: pixel_renderer(x) / 255.0
153
+ eval_levels = get_eval_levels(config["eval_levels"], env.static_env_params)
154
+
155
+ def _vmapped_eval_step(runner_state, rng):
156
+ def _single_eval_step(rng):
157
+ return general_eval(
158
+ rng,
159
+ eval_env_nonbatch,
160
+ env_params,
161
+ runner_state[0],
162
+ eval_levels,
163
+ env_params.max_timesteps,
164
+ config["num_eval_levels"],
165
+ keep_states=True,
166
+ return_trajectories=True,
167
+ )
168
+
169
+ (states, returns, done_idxs, episode_lengths, eval_infos), (eval_dones, eval_rewards) = jax.vmap(
170
+ _single_eval_step
171
+ )(jax.random.split(rng, config["eval_num_attempts"]))
172
+ eval_solves = (eval_infos["returned_episode_solved"] * eval_dones).sum(axis=1) / jnp.maximum(
173
+ 1, eval_dones.sum(axis=1)
174
+ )
175
+ states_to_plot = jax.tree.map(lambda x: x[0], states)
176
+ # obs = jax.vmap(jax.vmap(pixel_render_fn))(states_to_plot.env_state.env_state.env_state)
177
+
178
+ return (
179
+ states_to_plot,
180
+ done_idxs[0],
181
+ returns[0],
182
+ returns.mean(axis=0),
183
+ episode_lengths.mean(axis=0),
184
+ eval_solves.mean(axis=0),
185
+ )
186
+
187
+ # TRAIN LOOP
188
+ def _update_step(runner_state, unused):
189
+ # COLLECT TRAJECTORIES
190
+ def _env_step(runner_state, unused):
191
+ (
192
+ train_state,
193
+ env_state,
194
+ last_obs,
195
+ last_done,
196
+ hstate,
197
+ rng,
198
+ update_step,
199
+ ) = runner_state
200
+
201
+ # SELECT ACTION
202
+ rng, _rng = jax.random.split(rng)
203
+ ac_in = (jax.tree.map(lambda x: x[np.newaxis, :], last_obs), last_done[np.newaxis, :])
204
+ hstate, pi, value = network.apply(train_state.params, hstate, ac_in)
205
+ action = pi.sample(seed=_rng)
206
+ log_prob = pi.log_prob(action)
207
+ value, action, log_prob = (
208
+ value.squeeze(0),
209
+ action.squeeze(0),
210
+ log_prob.squeeze(0),
211
+ )
212
+
213
+ # STEP ENV
214
+ rng, _rng = jax.random.split(rng)
215
+ obsv, env_state, reward, done, info = env.step(_rng, env_state, action, env_params)
216
+ transition = Transition(last_done, action, value, reward, log_prob, last_obs, info)
217
+ runner_state = (
218
+ train_state,
219
+ env_state,
220
+ obsv,
221
+ done,
222
+ hstate,
223
+ rng,
224
+ update_step,
225
+ )
226
+ return runner_state, transition
227
+
228
+ initial_hstate = runner_state[-3]
229
+ runner_state, traj_batch = jax.lax.scan(_env_step, runner_state, None, config["num_steps"])
230
+
231
+ # CALCULATE ADVANTAGE
232
+ (
233
+ train_state,
234
+ env_state,
235
+ last_obs,
236
+ last_done,
237
+ hstate,
238
+ rng,
239
+ update_step,
240
+ ) = runner_state
241
+ ac_in = (jax.tree.map(lambda x: x[np.newaxis, :], last_obs), last_done[np.newaxis, :])
242
+ _, _, last_val = network.apply(train_state.params, hstate, ac_in)
243
+ last_val = last_val.squeeze(0)
244
+
245
+ def _calculate_gae(traj_batch, last_val, last_done):
246
+ def _get_advantages(carry, transition):
247
+ gae, next_value, next_done = carry
248
+ done, value, reward = (
249
+ transition.done,
250
+ transition.value,
251
+ transition.reward,
252
+ )
253
+ delta = reward + config["gamma"] * next_value * (1 - next_done) - value
254
+ gae = delta + config["gamma"] * config["gae_lambda"] * (1 - next_done) * gae
255
+ return (gae, value, done), gae
256
+
257
+ _, advantages = jax.lax.scan(
258
+ _get_advantages,
259
+ (jnp.zeros_like(last_val), last_val, last_done),
260
+ traj_batch,
261
+ reverse=True,
262
+ unroll=16,
263
+ )
264
+ return advantages, advantages + traj_batch.value
265
+
266
+ advantages, targets = _calculate_gae(traj_batch, last_val, last_done)
267
+
268
+ # UPDATE NETWORK
269
+ def _update_epoch(update_state, unused):
270
+ def _update_minbatch(train_state, batch_info):
271
+ init_hstate, traj_batch, advantages, targets = batch_info
272
+
273
+ def _loss_fn(params, init_hstate, traj_batch, gae, targets):
274
+ # RERUN NETWORK
275
+ _, pi, value = network.apply(params, init_hstate[0], (traj_batch.obs, traj_batch.done))
276
+ log_prob = pi.log_prob(traj_batch.action)
277
+
278
+ # CALCULATE VALUE LOSS
279
+ value_pred_clipped = traj_batch.value + (value - traj_batch.value).clip(
280
+ -config["clip_eps"], config["clip_eps"]
281
+ )
282
+ value_losses = jnp.square(value - targets)
283
+ value_losses_clipped = jnp.square(value_pred_clipped - targets)
284
+ value_loss = 0.5 * jnp.maximum(value_losses, value_losses_clipped).mean()
285
+
286
+ # CALCULATE ACTOR LOSS
287
+ ratio = jnp.exp(log_prob - traj_batch.log_prob)
288
+ gae = (gae - gae.mean()) / (gae.std() + 1e-8)
289
+ loss_actor1 = ratio * gae
290
+ loss_actor2 = (
291
+ jnp.clip(
292
+ ratio,
293
+ 1.0 - config["clip_eps"],
294
+ 1.0 + config["clip_eps"],
295
+ )
296
+ * gae
297
+ )
298
+ loss_actor = -jnp.minimum(loss_actor1, loss_actor2)
299
+ loss_actor = loss_actor.mean()
300
+ entropy = pi.entropy().mean()
301
+
302
+ total_loss = loss_actor + config["vf_coef"] * value_loss - config["ent_coef"] * entropy
303
+ return total_loss, (value_loss, loss_actor, entropy)
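A quick numeric illustration of the clipped surrogate computed in _loss_fn above (toy values, not from the repository): with clip_eps = 0.2 and a positive advantage, a probability ratio that drifts beyond 1 ± 0.2 stops contributing extra objective, since the minimum of the clipped and unclipped terms is taken (the loss is the negative mean of this quantity).

    import jax.numpy as jnp

    clip_eps = 0.2
    ratio = jnp.array([0.7, 1.0, 1.3])   # new_prob / old_prob
    gae = jnp.array([1.0, 1.0, 1.0])     # normalised advantages (all positive here)

    unclipped = ratio * gae
    clipped = jnp.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * gae
    per_sample_objective = jnp.minimum(unclipped, clipped)
    print(per_sample_objective)  # [0.7 1.  1.2] -> the 1.3 ratio is capped at 1.2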
304
+
305
+ grad_fn = jax.value_and_grad(_loss_fn, has_aux=True)
306
+ total_loss, grads = grad_fn(train_state.params, init_hstate, traj_batch, advantages, targets)
307
+ train_state = train_state.apply_gradients(grads=grads)
308
+ return train_state, total_loss
309
+
310
+ (
311
+ train_state,
312
+ init_hstate,
313
+ traj_batch,
314
+ advantages,
315
+ targets,
316
+ rng,
317
+ ) = update_state
318
+ rng, _rng = jax.random.split(rng)
319
+ permutation = jax.random.permutation(_rng, config["num_train_envs"])
320
+ batch = (init_hstate, traj_batch, advantages, targets)
321
+
322
+ shuffled_batch = jax.tree_util.tree_map(lambda x: jnp.take(x, permutation, axis=1), batch)
323
+
324
+ minibatches = jax.tree_util.tree_map(
325
+ lambda x: jnp.swapaxes(
326
+ jnp.reshape(
327
+ x,
328
+ [x.shape[0], config["num_minibatches"], -1] + list(x.shape[2:]),
329
+ ),
330
+ 1,
331
+ 0,
332
+ ),
333
+ shuffled_batch,
334
+ )
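The reshape/swapaxes above splits data laid out as (num_steps, num_train_envs, ...) into num_minibatches chunks that each keep the full time axis, as the recurrent network needs whole trajectories. A shape-only sketch with made-up sizes:

    import jax.numpy as jnp

    num_steps, num_train_envs, num_minibatches = 8, 16, 4
    x = jnp.zeros((num_steps, num_train_envs, 5))  # e.g. one leaf of the trajectory pytree

    minibatches = jnp.swapaxes(
        jnp.reshape(x, [x.shape[0], num_minibatches, -1] + list(x.shape[2:])),
        1,
        0,
    )
    print(minibatches.shape)  # (4, 8, 4, 5): minibatch, time, envs-per-minibatch, features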
335
+
336
+ train_state, total_loss = jax.lax.scan(_update_minbatch, train_state, minibatches)
337
+ update_state = (
338
+ train_state,
339
+ init_hstate,
340
+ traj_batch,
341
+ advantages,
342
+ targets,
343
+ rng,
344
+ )
345
+ return update_state, total_loss
346
+
347
+ init_hstate = initial_hstate[None, :]  # add a leading time dimension: (1, batch, hidden)
348
+ update_state = (
349
+ train_state,
350
+ init_hstate,
351
+ traj_batch,
352
+ advantages,
353
+ targets,
354
+ rng,
355
+ )
356
+ update_state, loss_info = jax.lax.scan(_update_epoch, update_state, None, config["update_epochs"])
357
+ train_state = update_state[0]
358
+ metric = jax.tree.map(
359
+ lambda x: (x * traj_batch.info["returned_episode"]).sum() / traj_batch.info["returned_episode"].sum(),
360
+ traj_batch.info,
361
+ )
362
+ rng = update_state[-1]
363
+
364
+ if config["use_wandb"]:
365
+ vid_frequency = get_video_frequency(config, update_step)
366
+ rng, _rng = jax.random.split(rng)
367
+ to_log_videos = _vmapped_eval_step(runner_state, _rng)
368
+ should_log_videos = update_step % vid_frequency == 0
369
+ first = jax.lax.cond(
370
+ should_log_videos,
371
+ lambda: jax.vmap(jax.vmap(pixel_render_fn))(to_log_videos[0].env_state.env_state.env_state),
372
+ lambda: (
373
+ jnp.zeros(
374
+ (
375
+ env_params.max_timesteps,
376
+ config["num_eval_levels"],
377
+ *PixelObservations(env_params, render_static_env_params)
378
+ .observation_space(env_params)
379
+ .shape,
380
+ )
381
+ )
382
+ ),
383
+ )
384
+ to_log_videos = (first, should_log_videos, *to_log_videos[1:])
385
+
386
+ def callback(metric, raw_info, loss_info, update_step, to_log_videos):
387
+ to_log = {}
388
+ to_log["timing/num_updates"] = update_step
389
+ to_log["timing/num_env_steps"] = update_step * config["num_steps"] * config["num_train_envs"]
390
+ (
391
+ obs_vid,
392
+ should_log_videos,
393
+ idx_vid,
394
+ eval_return_vid,
395
+ eval_return_mean,
396
+ eval_eplen_mean,
397
+ eval_solverate_mean,
398
+ ) = to_log_videos
399
+ to_log["eval/mean_eval_return"] = eval_return_mean.mean()
400
+ to_log["eval/mean_eval_eplen"] = eval_eplen_mean.mean()
401
+ for i, eval_name in enumerate(config["eval_levels"]):
402
+ return_on_video = eval_return_vid[i]
403
+ to_log[f"eval_video/return_{eval_name}"] = return_on_video
404
+ to_log[f"eval_video/len_{eval_name}"] = idx_vid[i]
405
+ to_log[f"eval_avg/return_{eval_name}"] = eval_return_mean[i]
406
+ to_log[f"eval_avg/solve_rate_{eval_name}"] = eval_solverate_mean[i]
407
+
408
+ if should_log_videos:
409
+ for i, eval_name in enumerate(config["eval_levels"]):
410
+ obs_to_use = obs_vid[: idx_vid[i], i]
411
+ obs_to_use = np.asarray(obs_to_use).transpose(0, 3, 2, 1)[:, :, ::-1, :]
412
+ to_log[f"media/eval_video_{eval_name}"] = wandb.Video((obs_to_use * 255).astype(np.uint8))
413
+
414
+ wandb.log(to_log)
415
+
416
+ jax.debug.callback(callback, metric, traj_batch.info, loss_info, update_step, to_log_videos)
417
+
418
+ runner_state = (
419
+ train_state,
420
+ env_state,
421
+ last_obs,
422
+ last_done,
423
+ hstate,
424
+ rng,
425
+ update_step + 1,
426
+ )
427
+ return runner_state, metric
428
+
429
+ rng, _rng = jax.random.split(rng)
430
+ runner_state = (
431
+ train_state,
432
+ env_state,
433
+ obsv,
434
+ jnp.zeros((config["num_train_envs"]), dtype=bool),
435
+ init_hstate,
436
+ _rng,
437
+ 0,
438
+ )
439
+ runner_state, metric = jax.lax.scan(_update_step, runner_state, None, config["num_updates"])
440
+ return {"runner_state": runner_state, "metric": metric}
441
+
442
+ return train
443
+
444
+
445
+ @hydra.main(version_base=None, config_path="../configs", config_name="ppo")
446
+ def main(config):
447
+ config = normalise_config(OmegaConf.to_container(config), "PPO")
448
+ env_params, static_env_params = generate_params_from_config(config)
449
+ config["env_params"] = to_state_dict(env_params)
450
+ config["static_env_params"] = to_state_dict(static_env_params)
451
+
452
+ if config["use_wandb"]:
453
+ run = init_wandb(config, "PPO")
454
+
455
+ rng = jax.random.PRNGKey(config["seed"])
456
+ rng, _rng = jax.random.split(rng)
457
+ train_jit = jax.jit(make_train(config, env_params, static_env_params))
458
+
459
+ out = train_jit(_rng)
460
+
461
+ if config["use_wandb"]:
462
+ if config["save_policy"]:
463
+ train_state = jax.tree.map(lambda x: x, out["runner_state"][0])
464
+ save_model_to_wandb(train_state, config["total_timesteps"], config)
465
+
466
+
467
+ if __name__ == "__main__":
468
+ main()
Kinetix/experiments/sfl.py ADDED
@@ -0,0 +1,1067 @@
1
+ """
2
+ Based on PureJaxRL Implementation of PPO
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ import time
8
+ import typing
9
+ from functools import partial
10
+ from typing import NamedTuple
11
+
12
+ import chex
13
+ import hydra
14
+ import jax
15
+ import jax.experimental
16
+ import jax.numpy as jnp
17
+ import matplotlib.pyplot as plt
18
+ import numpy as np
19
+ import optax
20
+ from flax.training.train_state import TrainState
21
+ from kinetix.environment.ued.ued import make_vmapped_filtered_level_sampler
22
+ from kinetix.environment.ued.ued import (
23
+ make_reset_train_function_with_list_of_levels,
24
+ make_reset_train_function_with_mutations,
25
+ )
26
+ from kinetix.util.config import (
27
+ generate_ued_params_from_config,
28
+ init_wandb,
29
+ normalise_config,
30
+ generate_params_from_config,
31
+ get_eval_level_groups,
32
+ )
33
+ from jaxued.environments.underspecified_env import EnvParams, EnvState, Observation, UnderspecifiedEnv
34
+ from omegaconf import OmegaConf
35
+ from PIL import Image
36
+ from flax.serialization import to_state_dict
37
+
38
+ import wandb
39
+ from kinetix.environment.env import make_kinetix_env_from_name
40
+ from kinetix.environment.wrappers import (
41
+ AutoReplayWrapper,
42
+ DenseRewardWrapper,
43
+ LogWrapper,
44
+ UnderspecifiedToGymnaxWrapper,
45
+ )
46
+ from kinetix.models import make_network_from_config
47
+ from kinetix.models.actor_critic import ScannedRNN
48
+ from kinetix.render.renderer_pixels import make_render_pixels
49
+ from kinetix.util.learning import general_eval, get_eval_levels
50
+ from kinetix.util.saving import (
51
+ load_train_state_from_wandb_artifact_path,
52
+ save_model_to_wandb,
53
+ )
54
+
55
+ sys.path.append("ued")
56
+ from flax.traverse_util import flatten_dict, unflatten_dict
57
+ from safetensors.flax import load_file, save_file
58
+
59
+
60
+ def save_params(params: typing.Dict, filename: typing.Union[str, os.PathLike]) -> None:
61
+ flattened_dict = flatten_dict(params, sep=",")
62
+ save_file(flattened_dict, filename)
63
+
64
+
65
+ def load_params(filename: typing.Union[str, os.PathLike]) -> typing.Dict:
66
+ flattened_dict = load_file(filename)
67
+ return unflatten_dict(flattened_dict, sep=",")
68
+
69
+
70
+ class Transition(NamedTuple):
71
+ global_done: jnp.ndarray
72
+ done: jnp.ndarray
73
+ action: jnp.ndarray
74
+ value: jnp.ndarray
75
+ reward: jnp.ndarray
76
+ log_prob: jnp.ndarray
77
+ obs: jnp.ndarray
78
+ info: jnp.ndarray
79
+
80
+
81
+ class RolloutBatch(NamedTuple):
82
+ obs: jnp.ndarray
83
+ actions: jnp.ndarray
84
+ rewards: jnp.ndarray
85
+ dones: jnp.ndarray
86
+ log_probs: jnp.ndarray
87
+ values: jnp.ndarray
88
+ targets: jnp.ndarray
89
+ advantages: jnp.ndarray
90
+ # carry: jnp.ndarray
91
+ mask: jnp.ndarray
92
+
93
+
94
+ def evaluate_rnn(
95
+ rng: chex.PRNGKey,
96
+ env: UnderspecifiedEnv,
97
+ env_params: EnvParams,
98
+ train_state: TrainState,
99
+ init_hstate: chex.ArrayTree,
100
+ init_obs: Observation,
101
+ init_env_state: EnvState,
102
+ max_episode_length: int,
103
+ keep_states=True,
104
+ ) -> tuple[chex.Array, chex.Array, chex.Array, chex.Array]:
105
+ """This runs the RNN on the environment, given an initial state and observation, and returns (states, rewards, episode_lengths)
106
+
107
+ Args:
108
+ rng (chex.PRNGKey):
109
+ env (UnderspecifiedEnv):
110
+ env_params (EnvParams):
111
+ train_state (TrainState):
112
+ init_hstate (chex.ArrayTree): Shape (num_levels, )
113
+ init_obs (Observation): Shape (num_levels, )
114
+ init_env_state (EnvState): Shape (num_levels, )
115
+ max_episode_length (int):
116
+
117
+ Returns:
118
+ Tuple[chex.Array, chex.Array, chex.Array, chex.Array]: (states, rewards, episode lengths, infos), with shapes ((NUM_STEPS, NUM_LEVELS), (NUM_STEPS, NUM_LEVELS), (NUM_LEVELS,), (NUM_STEPS, NUM_LEVELS))
119
+ """
120
+ num_levels = jax.tree_util.tree_flatten(init_obs)[0][0].shape[0]
121
+
122
+ def step(carry, _):
123
+ rng, hstate, obs, state, done, mask, episode_length = carry
124
+ rng, rng_action, rng_step = jax.random.split(rng, 3)
125
+
126
+ x = jax.tree.map(lambda x: x[None, ...], (obs, done))
127
+ hstate, pi, _ = train_state.apply_fn(train_state.params, hstate, x)
128
+ action = pi.sample(seed=rng_action).squeeze(0)
129
+
130
+ obs, next_state, reward, done, info = jax.vmap(env.step, in_axes=(0, 0, 0, None))(
131
+ jax.random.split(rng_step, num_levels), state, action, env_params
132
+ )
133
+
134
+ next_mask = mask & ~done
135
+ episode_length += mask
136
+
137
+ if keep_states:
138
+ return (rng, hstate, obs, next_state, done, next_mask, episode_length), (state, reward, info)
139
+ else:
140
+ return (rng, hstate, obs, next_state, done, next_mask, episode_length), (None, reward, info)
141
+
142
+ (_, _, _, _, _, _, episode_lengths), (states, rewards, infos) = jax.lax.scan(
143
+ step,
144
+ (
145
+ rng,
146
+ init_hstate,
147
+ init_obs,
148
+ init_env_state,
149
+ jnp.zeros(num_levels, dtype=bool),
150
+ jnp.ones(num_levels, dtype=bool),
151
+ jnp.zeros(num_levels, dtype=jnp.int32),
152
+ ),
153
+ None,
154
+ length=max_episode_length,
155
+ )
156
+
157
+ return states, rewards, episode_lengths, infos
158
+
159
+
160
+ @hydra.main(version_base=None, config_path="../configs", config_name="sfl")
161
+ def main(config):
162
+ time_start = time.time()
163
+ config = OmegaConf.to_container(config)
164
+ config = normalise_config(config, "SFL" if config["ued"]["sampled_envs_ratio"] > 0 else "SFL-DR")
165
+ env_params, static_env_params = generate_params_from_config(config)
166
+ config["env_params"] = to_state_dict(env_params)
167
+ config["static_env_params"] = to_state_dict(static_env_params)
168
+ run = init_wandb(config, "SFL")
169
+
170
+ rng = jax.random.PRNGKey(config["seed"])
171
+
172
+ config["num_envs_from_sampled"] = int(config["num_train_envs"] * config["sampled_envs_ratio"])
173
+ config["num_envs_to_generate"] = int(config["num_train_envs"] * (1 - config["sampled_envs_ratio"]))
174
+ assert (config["num_envs_from_sampled"] + config["num_envs_to_generate"]) == config["num_train_envs"]
175
+
176
+ def make_env(static_env_params):
177
+ env = make_kinetix_env_from_name(config["env_name"], static_env_params=static_env_params)
178
+ env = AutoReplayWrapper(env)
179
+ env = UnderspecifiedToGymnaxWrapper(env)
180
+ env = DenseRewardWrapper(env, dense_reward_scale=config["dense_reward_scale"])
181
+ env = LogWrapper(env)
182
+ return env
183
+
184
+ env = make_env(static_env_params)
185
+
186
+ if config["train_level_mode"] == "list":
187
+ sample_random_level = make_reset_train_function_with_list_of_levels(
188
+ config, config["train_levels"], static_env_params, make_pcg_state=False, is_loading_train_levels=True
189
+ )
190
+ elif config["train_level_mode"] == "random":
191
+ sample_random_level = make_reset_train_function_with_mutations(
192
+ env.physics_engine, env_params, static_env_params, config, make_pcg_state=False
193
+ )
194
+ else:
195
+ raise ValueError(f"Unknown train_level_mode: {config['train_level_mode']}")
196
+
197
+ sample_random_levels = make_vmapped_filtered_level_sampler(
198
+ sample_random_level, env_params, static_env_params, config, make_pcg_state=False, env=env
199
+ )
200
+ _, eval_static_env_params = generate_params_from_config(
201
+ config["eval_env_size_true"] | {"frame_skip": config["frame_skip"]}
202
+ )
203
+ eval_env = make_env(eval_static_env_params)
204
+ ued_params = generate_ued_params_from_config(config)
205
+
206
+ def make_render_fn(static_env_params):
207
+ render_fn_inner = make_render_pixels(env_params, static_env_params)
208
+ render_fn = lambda x: render_fn_inner(x).transpose(1, 0, 2)[::-1]
209
+ return render_fn
210
+
211
+ render_fn = make_render_fn(static_env_params)
212
+ render_fn_eval = make_render_fn(eval_static_env_params)
213
+
214
+ NUM_EVAL_DR_LEVELS = 200
215
+ key_to_sample_dr_eval_set = jax.random.PRNGKey(100)
216
+ DR_EVAL_LEVELS = sample_random_levels(key_to_sample_dr_eval_set, NUM_EVAL_DR_LEVELS)
217
+
218
+ print("Hello here num steps is ", config["num_steps"])
219
+ print("CONFIG is ", config)
220
+
221
+ config["total_timesteps"] = config["num_updates"] * config["num_steps"] * config["num_train_envs"]
222
+ config["minibatch_size"] = config["num_train_envs"] * config["num_steps"] // config["num_minibatches"]
223
+ config["clip_eps"] = config["clip_eps"]
224
+
225
+ config["env_name"] = config["env_name"]
226
+ network = make_network_from_config(env, env_params, config)
227
+
228
+ def linear_schedule(count):
229
+ count = count // (config["num_minibatches"] * config["update_epochs"])
230
+ frac = 1.0 - count / config["num_updates"]
231
+ return config["lr"] * frac
232
+
233
+ # INIT NETWORK
234
+ rng, _rng = jax.random.split(rng)
235
+ train_envs = 32 # Keep this small to avoid running out of memory; the initial sample size does not matter.
236
+ obs, _ = env.reset_to_level(rng, sample_random_level(rng), env_params)
237
+ obs = jax.tree.map(
238
+ lambda x: jnp.repeat(jnp.repeat(x[None, ...], train_envs, axis=0)[None, ...], 256, axis=0),
239
+ obs,
240
+ )
241
+ init_x = (obs, jnp.zeros((256, train_envs)))
242
+ init_hstate = ScannedRNN.initialize_carry(train_envs)
243
+ network_params = network.init(_rng, init_hstate, init_x)
244
+ if config["anneal_lr"]:
245
+ tx = optax.chain(
246
+ optax.clip_by_global_norm(config["max_grad_norm"]),
247
+ optax.adam(learning_rate=linear_schedule, eps=1e-5),
248
+ )
249
+ else:
250
+ tx = optax.chain(
251
+ optax.clip_by_global_norm(config["max_grad_norm"]),
252
+ optax.adam(config["lr"], eps=1e-5),
253
+ )
254
+ train_state = TrainState.create(
255
+ apply_fn=network.apply,
256
+ params=network_params,
257
+ tx=tx,
258
+ )
259
+ if config["load_from_checkpoint"] != None:
260
+ print("LOADING from", config["load_from_checkpoint"], "with only params =", config["load_only_params"])
261
+ train_state = load_train_state_from_wandb_artifact_path(
262
+ train_state,
263
+ config["load_from_checkpoint"],
264
+ load_only_params=config["load_only_params"],
265
+ legacy=config["load_legacy_checkpoint"],
266
+ )
267
+
268
+ rng, _rng = jax.random.split(rng)
269
+
270
+ # INIT ENV
271
+ rng, _rng, _rng2 = jax.random.split(rng, 3)
272
+ rng_reset = jax.random.split(_rng, config["num_train_envs"])
273
+
274
+ new_levels = sample_random_levels(_rng2, config["num_train_envs"])
275
+ obsv, env_state = jax.vmap(env.reset_to_level, in_axes=(0, 0, None))(rng_reset, new_levels, env_params)
276
+
277
+ start_state = env_state
278
+ init_hstate = ScannedRNN.initialize_carry(config["num_train_envs"])
279
+
280
+ @jax.jit
281
+ def log_buffer_learnability(rng, train_state, instances):
282
+ BATCH_SIZE = config["num_to_save"]
283
+ BATCH_ACTORS = BATCH_SIZE
284
+
285
+ def _batch_step(unused, rng):
286
+ def _env_step(runner_state, unused):
287
+ env_state, start_state, last_obs, last_done, hstate, rng = runner_state
288
+
289
+ # SELECT ACTION
290
+ rng, _rng = jax.random.split(rng)
291
+ obs_batch = last_obs
292
+ ac_in = (
293
+ jax.tree.map(lambda x: x[np.newaxis, :], obs_batch),
294
+ last_done[np.newaxis, :],
295
+ )
296
+ hstate, pi, value = network.apply(train_state.params, hstate, ac_in)
297
+ action = pi.sample(seed=_rng).squeeze()
298
+ log_prob = pi.log_prob(action)
299
+ env_act = action
300
+
301
+ # STEP ENV
302
+ rng, _rng = jax.random.split(rng)
303
+ rng_step = jax.random.split(_rng, config["num_to_save"])
304
+ obsv, env_state, reward, done, info = jax.vmap(env.step, in_axes=(0, 0, 0, None))(
305
+ rng_step, env_state, env_act, env_params
306
+ )
307
+ done_batch = done
308
+
309
+ transition = Transition(
310
+ done,
311
+ last_done,
312
+ action.squeeze(),
313
+ value.squeeze(),
314
+ reward,
315
+ log_prob.squeeze(),
316
+ obs_batch,
317
+ info,
318
+ )
319
+ runner_state = (env_state, start_state, obsv, done_batch, hstate, rng)
320
+ return runner_state, transition
321
+
322
+ @partial(jax.vmap, in_axes=(None, 1, 1, 1))
323
+ @partial(jax.jit, static_argnums=(0,))
324
+ def _calc_outcomes_by_agent(max_steps: int, dones, returns, info):
325
+ idxs = jnp.arange(max_steps)
326
+
327
+ @partial(jax.vmap, in_axes=(0, 0))
328
+ def __ep_outcomes(start_idx, end_idx):
329
+ mask = (idxs > start_idx) & (idxs <= end_idx) & (end_idx != max_steps)
330
+ r = jnp.sum(returns * mask)
331
+ goal_r = info["GoalR"] # (returns > 0) * 1.0
332
+ success = jnp.sum(goal_r * mask)
333
+ l = end_idx - start_idx
334
+ return r, success, l
335
+
336
+ done_idxs = jnp.argwhere(dones, size=50, fill_value=max_steps).squeeze()
337
+ mask_done = jnp.where(done_idxs == max_steps, 0, 1)
338
+ ep_return, success, length = __ep_outcomes(
339
+ jnp.concatenate([jnp.array([-1]), done_idxs[:-1]]), done_idxs
340
+ )
341
+
342
+ return {
343
+ "ep_return": ep_return.mean(where=mask_done),
344
+ "num_episodes": mask_done.sum(),
345
+ "success_rate": success.mean(where=mask_done),
346
+ "ep_len": length.mean(where=mask_done),
347
+ }
348
+
349
+ # sample envs
350
+ rng, _rng, _rng2 = jax.random.split(rng, 3)
351
+ rng_reset = jax.random.split(_rng, config["num_to_save"])
352
+ rng_levels = jax.random.split(_rng2, config["num_to_save"])
353
+ # obsv, env_state = jax.vmap(sample_random_level, in_axes=(0,))(reset_rng)
354
+ # new_levels = jax.vmap(sample_random_level)(rng_levels)
355
+ obsv, env_state = jax.vmap(env.reset_to_level, in_axes=(0, 0, None))(rng_reset, instances, env_params)
356
+ # env_instances = new_levels
357
+ init_hstate = ScannedRNN.initialize_carry(
358
+ BATCH_ACTORS,
359
+ )
360
+
361
+ runner_state = (env_state, env_state, obsv, jnp.zeros((BATCH_ACTORS), dtype=bool), init_hstate, rng)
362
+ runner_state, traj_batch = jax.lax.scan(_env_step, runner_state, None, config["rollout_steps"])
363
+ done_by_env = traj_batch.done.reshape((-1, config["num_to_save"]))
364
+ reward_by_env = traj_batch.reward.reshape((-1, config["num_to_save"]))
365
+ # info_by_actor = jax.tree.map(lambda x: x.swapaxes(2, 1).reshape((-1, BATCH_ACTORS)), traj_batch.info)
366
+ o = _calc_outcomes_by_agent(config["rollout_steps"], traj_batch.done, traj_batch.reward, traj_batch.info)
367
+ success_by_env = o["success_rate"].reshape((1, config["num_to_save"]))
368
+ learnability_by_env = (success_by_env * (1 - success_by_env)).sum(axis=0)
369
+ return None, (learnability_by_env, success_by_env.sum(axis=0))
370
+
371
+ rngs = jax.random.split(rng, 1)
372
+ _, (learnability, success_by_env) = jax.lax.scan(_batch_step, None, rngs, 1)
373
+ return learnability[0], success_by_env[0]
374
+
375
+ num_eval_levels = len(config["eval_levels"])
376
+ all_eval_levels = get_eval_levels(config["eval_levels"], eval_env.static_env_params)
377
+
378
+ eval_group_indices = get_eval_level_groups(config["eval_levels"])
379
+ print("group indices", eval_group_indices)
380
+
381
+ @jax.jit
382
+ def get_learnability_set(rng, network_params):
383
+
384
+ BATCH_ACTORS = config["batch_size"]
385
+
386
+ def _batch_step(unused, rng):
387
+ def _env_step(runner_state, unused):
388
+ env_state, start_state, last_obs, last_done, hstate, rng = runner_state
389
+
390
+ # SELECT ACTION
391
+ rng, _rng = jax.random.split(rng)
392
+ obs_batch = last_obs
393
+ ac_in = (
394
+ jax.tree.map(lambda x: x[np.newaxis, :], obs_batch),
395
+ last_done[np.newaxis, :],
396
+ )
397
+ hstate, pi, value = network.apply(network_params, hstate, ac_in)
398
+ action = pi.sample(seed=_rng).squeeze()
399
+ log_prob = pi.log_prob(action)
400
+ env_act = action
401
+
402
+ # STEP ENV
403
+ rng, _rng = jax.random.split(rng)
404
+ rng_step = jax.random.split(_rng, config["batch_size"])
405
+ obsv, env_state, reward, done, info = jax.vmap(env.step, in_axes=(0, 0, 0, None))(
406
+ rng_step, env_state, env_act, env_params
407
+ )
408
+ done_batch = done
409
+
410
+ transition = Transition(
411
+ done,
412
+ last_done,
413
+ action.squeeze(),
414
+ value.squeeze(),
415
+ reward,
416
+ log_prob.squeeze(),
417
+ obs_batch,
418
+ info,
419
+ )
420
+ runner_state = (env_state, start_state, obsv, done_batch, hstate, rng)
421
+ return runner_state, transition
422
+
423
+ @partial(jax.vmap, in_axes=(None, 1, 1, 1))
424
+ @partial(jax.jit, static_argnums=(0,))
425
+ def _calc_outcomes_by_agent(max_steps: int, dones, returns, info):
426
+ idxs = jnp.arange(max_steps)
427
+
428
+ @partial(jax.vmap, in_axes=(0, 0))
429
+ def __ep_outcomes(start_idx, end_idx):
430
+ mask = (idxs > start_idx) & (idxs <= end_idx) & (end_idx != max_steps)
431
+ r = jnp.sum(returns * mask)
432
+ goal_r = info["GoalR"] # (returns > 0) * 1.0
433
+ success = jnp.sum(goal_r * mask)
434
+ l = end_idx - start_idx
435
+ return r, success, l
436
+
437
+ done_idxs = jnp.argwhere(dones, size=50, fill_value=max_steps).squeeze()
438
+ mask_done = jnp.where(done_idxs == max_steps, 0, 1)
439
+ ep_return, success, length = __ep_outcomes(
440
+ jnp.concatenate([jnp.array([-1]), done_idxs[:-1]]), done_idxs
441
+ )
442
+
443
+ return {
444
+ "ep_return": ep_return.mean(where=mask_done),
445
+ "num_episodes": mask_done.sum(),
446
+ "success_rate": success.mean(where=mask_done),
447
+ "ep_len": length.mean(where=mask_done),
448
+ }
449
+
450
+ # sample envs
451
+ rng, _rng, _rng2 = jax.random.split(rng, 3)
452
+ rng_reset = jax.random.split(_rng, config["batch_size"])
453
+ new_levels = sample_random_levels(_rng2, config["batch_size"])
454
+ obsv, env_state = jax.vmap(env.reset_to_level, in_axes=(0, 0, None))(rng_reset, new_levels, env_params)
455
+ env_instances = new_levels
456
+ init_hstate = ScannedRNN.initialize_carry(
457
+ BATCH_ACTORS,
458
+ )
459
+
460
+ runner_state = (env_state, env_state, obsv, jnp.zeros((BATCH_ACTORS), dtype=bool), init_hstate, rng)
461
+ runner_state, traj_batch = jax.lax.scan(_env_step, runner_state, None, config["rollout_steps"])
462
+ done_by_env = traj_batch.done.reshape((-1, config["batch_size"]))
463
+ reward_by_env = traj_batch.reward.reshape((-1, config["batch_size"]))
464
+ # info_by_actor = jax.tree.map(lambda x: x.swapaxes(2, 1).reshape((-1, BATCH_ACTORS)), traj_batch.info)
465
+ o = _calc_outcomes_by_agent(config["rollout_steps"], traj_batch.done, traj_batch.reward, traj_batch.info)
466
+ success_by_env = o["success_rate"].reshape((1, config["batch_size"]))
467
+ learnability_by_env = (success_by_env * (1 - success_by_env)).sum(axis=0)
468
+ return None, (learnability_by_env, success_by_env.sum(axis=0), env_instances)
469
+
470
+ if config["sampled_envs_ratio"] == 0.0:
471
+ print("Not doing any rollouts because sampled_envs_ratio is 0.0")
472
+ # Here we have zero envs, so we can literally just sample random ones because there is no point.
473
+ top_instances = sample_random_levels(_rng, config["num_to_save"])
474
+ top_success = top_learn = learnability = success_rates = jnp.zeros(config["num_to_save"])
475
+ else:
476
+ rngs = jax.random.split(rng, config["num_batches"])
477
+ _, (learnability, success_rates, env_instances) = jax.lax.scan(
478
+ _batch_step, None, rngs, config["num_batches"]
479
+ )
480
+
481
+ flat_env_instances = jax.tree.map(lambda x: x.reshape((-1,) + x.shape[2:]), env_instances)
482
+ learnability = learnability.flatten() + success_rates.flatten() * 0.001
483
+ top_1000 = jnp.argsort(learnability)[-config["num_to_save"] :]
484
+
485
+ top_1000_instances = jax.tree.map(lambda x: x.at[top_1000].get(), flat_env_instances)
486
+ top_learn, top_instances = learnability.at[top_1000].get(), top_1000_instances
487
+ top_success = success_rates.at[top_1000].get()
488
+
489
+ if config["put_eval_levels_in_buffer"]:
490
+ top_instances = jax.tree.map(
491
+ lambda all, new: jnp.concatenate([all[:-num_eval_levels], new], axis=0),
492
+ top_instances,
493
+ all_eval_levels.env_state,
494
+ )
495
+
496
+ log = {
497
+ "learnability/learnability_sampled_mean": learnability.mean(),
498
+ "learnability/learnability_sampled_median": jnp.median(learnability),
499
+ "learnability/learnability_sampled_min": learnability.min(),
500
+ "learnability/learnability_sampled_max": learnability.max(),
501
+ "learnability/learnability_selected_mean": top_learn.mean(),
502
+ "learnability/learnability_selected_median": jnp.median(top_learn),
503
+ "learnability/learnability_selected_min": top_learn.min(),
504
+ "learnability/learnability_selected_max": top_learn.max(),
505
+ "learnability/solve_rate_sampled_mean": top_success.mean(),
506
+ "learnability/solve_rate_sampled_median": jnp.median(top_success),
507
+ "learnability/solve_rate_sampled_min": top_success.min(),
508
+ "learnability/solve_rate_sampled_max": top_success.max(),
509
+ "learnability/solve_rate_selected_mean": success_rates.mean(),
510
+ "learnability/solve_rate_selected_median": jnp.median(success_rates),
511
+ "learnability/solve_rate_selected_min": success_rates.min(),
512
+ "learnability/solve_rate_selected_max": success_rates.max(),
513
+ }
514
+
515
+ return top_learn, top_instances, log
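The selection criterion computed above is the learnability score p·(1 - p), which peaks for levels the agent solves roughly half the time (the script additionally adds 0.001 × success rate as a tie-break). A toy example, with made-up success rates and a buffer of the top 3:

    import jax.numpy as jnp

    success_rates = jnp.array([0.0, 0.1, 0.5, 0.9, 1.0])   # per-level solve rates
    learnability = success_rates * (1.0 - success_rates)   # [0.0, 0.09, 0.25, 0.09, 0.0]

    num_to_save = 3
    top_idxs = jnp.argsort(learnability)[-num_to_save:]    # indices of the most learnable levels
    print(top_idxs, learnability[top_idxs])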
516
+
517
+ def eval(rng: chex.PRNGKey, train_state: TrainState, keep_states=True):
518
+ """
519
+ This evaluates the current policy on the set of evaluation levels specified by config["eval_levels"].
520
+ It returns ((states, cum_rewards, done_idxs, episode_lengths, infos), (dones, rewards)); states has shape (num_steps, num_eval_levels, ...) and the per-level arrays have shape (num_eval_levels,)
521
+ """
522
+ num_levels = len(config["eval_levels"])
523
+ # eval_levels = get_eval_levels(config["eval_levels"], eval_env.static_env_params)
524
+ return general_eval(
525
+ rng,
526
+ eval_env,
527
+ env_params,
528
+ train_state,
529
+ all_eval_levels,
530
+ env_params.max_timesteps,
531
+ num_levels,
532
+ keep_states=keep_states,
533
+ return_trajectories=True,
534
+ )
535
+
536
+ def eval_on_dr_levels(rng: chex.PRNGKey, train_state: TrainState, keep_states=False):
537
+ return general_eval(
538
+ rng,
539
+ env,
540
+ env_params,
541
+ train_state,
542
+ DR_EVAL_LEVELS,
543
+ env_params.max_timesteps,
544
+ NUM_EVAL_DR_LEVELS,
545
+ keep_states=keep_states,
546
+ )
547
+
548
+ def eval_on_top_learnable_levels(rng: chex.PRNGKey, train_state: TrainState, levels, keep_states=True):
549
+ N = 5
550
+ return general_eval(
551
+ rng,
552
+ env,
553
+ env_params,
554
+ train_state,
555
+ jax.tree.map(lambda x: x[:N], levels),
556
+ env_params.max_timesteps,
557
+ N,
558
+ keep_states=keep_states,
559
+ )
560
+
561
+ # TRAIN LOOP
562
+ def train_step(runner_state_instances, unused):
563
+ # COLLECT TRAJECTORIES
564
+ runner_state, instances = runner_state_instances
565
+ num_env_instances = instances.polygon.position.shape[0]
566
+
567
+ def _env_step(runner_state, unused):
568
+ train_state, env_state, start_state, last_obs, last_done, hstate, update_steps, rng = runner_state
569
+
570
+ # SELECT ACTION
571
+ rng, _rng = jax.random.split(rng)
572
+ obs_batch = last_obs
573
+ ac_in = (
574
+ jax.tree.map(lambda x: x[np.newaxis, :], obs_batch),
575
+ last_done[np.newaxis, :],
576
+ )
577
+ hstate, pi, value = network.apply(train_state.params, hstate, ac_in)
578
+ action = pi.sample(seed=_rng).squeeze()
579
+ log_prob = pi.log_prob(action)
580
+ env_act = action
581
+
582
+ # STEP ENV
583
+ rng, _rng = jax.random.split(rng)
584
+ rng_step = jax.random.split(_rng, config["num_train_envs"])
585
+ obsv, env_state, reward, done, info = jax.vmap(env.step, in_axes=(0, 0, 0, None))(
586
+ rng_step, env_state, env_act, env_params
587
+ )
588
+ done_batch = done
589
+ transition = Transition(
590
+ done,
591
+ last_done,
592
+ action.squeeze(),
593
+ value.squeeze(),
594
+ reward,
595
+ log_prob.squeeze(),
596
+ obs_batch,
597
+ info,
598
+ )
599
+ runner_state = (train_state, env_state, start_state, obsv, done_batch, hstate, update_steps, rng)
600
+ return runner_state, (transition)
601
+
602
+ initial_hstate = runner_state[-3]
603
+ runner_state, traj_batch = jax.lax.scan(_env_step, runner_state, None, config["num_steps"])
604
+
605
+ # CALCULATE ADVANTAGE
606
+ train_state, env_state, start_state, last_obs, last_done, hstate, update_steps, rng = runner_state
607
+ last_obs_batch = last_obs # batchify(last_obs, env.agents, config["num_train_envs"])
608
+ ac_in = (
609
+ jax.tree.map(lambda x: x[np.newaxis, :], last_obs_batch),
610
+ last_done[np.newaxis, :],
611
+ )
612
+ _, _, last_val = network.apply(train_state.params, hstate, ac_in)
613
+ last_val = last_val.squeeze()
614
+
615
+ def _calculate_gae(traj_batch, last_val):
616
+ def _get_advantages(gae_and_next_value, transition: Transition):
617
+ gae, next_value = gae_and_next_value
618
+ done, value, reward = (
619
+ transition.global_done,
620
+ transition.value,
621
+ transition.reward,
622
+ )
623
+ delta = reward + config["gamma"] * next_value * (1 - done) - value
624
+ gae = delta + config["gamma"] * config["gae_lambda"] * (1 - done) * gae
625
+ return (gae, value), gae
626
+
627
+ _, advantages = jax.lax.scan(
628
+ _get_advantages,
629
+ (jnp.zeros_like(last_val), last_val),
630
+ traj_batch,
631
+ reverse=True,
632
+ unroll=16,
633
+ )
634
+ return advantages, advantages + traj_batch.value
635
+
636
+ advantages, targets = _calculate_gae(traj_batch, last_val)
637
+
638
+ # UPDATE NETWORK
639
+ def _update_epoch(update_state, unused):
640
+ def _update_minbatch(train_state, batch_info):
641
+ init_hstate, traj_batch, advantages, targets = batch_info
642
+
643
+ def _loss_fn_masked(params, init_hstate, traj_batch, gae, targets):
644
+
645
+ # RERUN NETWORK
646
+ _, pi, value = network.apply(
647
+ params,
648
+ jax.tree.map(lambda x: x.transpose(), init_hstate),
649
+ (traj_batch.obs, traj_batch.done),
650
+ )
651
+ log_prob = pi.log_prob(traj_batch.action)
652
+
653
+ # CALCULATE VALUE LOSS
654
+ value_pred_clipped = traj_batch.value + (value - traj_batch.value).clip(
655
+ -config["clip_eps"], config["clip_eps"]
656
+ )
657
+ value_losses = jnp.square(value - targets)
658
+ value_losses_clipped = jnp.square(value_pred_clipped - targets)
659
+ value_loss = 0.5 * jnp.maximum(value_losses, value_losses_clipped)
660
+ critic_loss = config["vf_coef"] * value_loss.mean()
661
+
662
+ # CALCULATE ACTOR LOSS
663
+ logratio = log_prob - traj_batch.log_prob
664
+ ratio = jnp.exp(logratio)
665
+ # if env.do_sep_reward: gae = gae.sum(axis=-1)
666
+ gae = (gae - gae.mean()) / (gae.std() + 1e-8)
667
+ loss_actor1 = ratio * gae
668
+ loss_actor2 = (
669
+ jnp.clip(
670
+ ratio,
671
+ 1.0 - config["clip_eps"],
672
+ 1.0 + config["clip_eps"],
673
+ )
674
+ * gae
675
+ )
676
+ loss_actor = -jnp.minimum(loss_actor1, loss_actor2)
677
+ loss_actor = loss_actor.mean()
678
+ entropy = pi.entropy().mean()
679
+
680
+ approx_kl = jax.lax.stop_gradient(((ratio - 1) - logratio).mean())
681
+ clipfrac = jax.lax.stop_gradient((jnp.abs(ratio - 1) > config["clip_eps"]).mean())
682
+
683
+ total_loss = loss_actor + critic_loss - config["ent_coef"] * entropy
684
+ return total_loss, (value_loss, loss_actor, entropy, ratio, approx_kl, clipfrac)
685
+
686
+ grad_fn = jax.value_and_grad(_loss_fn_masked, has_aux=True)
687
+ total_loss, grads = grad_fn(train_state.params, init_hstate, traj_batch, advantages, targets)
688
+ train_state = train_state.apply_gradients(grads=grads)
689
+ return train_state, total_loss
690
+
691
+ (
692
+ train_state,
693
+ init_hstate,
694
+ traj_batch,
695
+ advantages,
696
+ targets,
697
+ rng,
698
+ ) = update_state
699
+ rng, _rng = jax.random.split(rng)
700
+
701
+ init_hstate = jax.tree.map(lambda x: jnp.reshape(x, (256, config["num_train_envs"])), init_hstate)
702
+ batch = (
703
+ init_hstate,
704
+ traj_batch,
705
+ advantages.squeeze(),
706
+ targets.squeeze(),
707
+ )
708
+ permutation = jax.random.permutation(_rng, config["num_train_envs"])
709
+
710
+ shuffled_batch = jax.tree_util.tree_map(lambda x: jnp.take(x, permutation, axis=1), batch)
711
+
712
+ minibatches = jax.tree_util.tree_map(
713
+ lambda x: jnp.swapaxes(
714
+ jnp.reshape(
715
+ x,
716
+ [x.shape[0], config["num_minibatches"], -1] + list(x.shape[2:]),
717
+ ),
718
+ 1,
719
+ 0,
720
+ ),
721
+ shuffled_batch,
722
+ )
723
+
724
+ train_state, total_loss = jax.lax.scan(_update_minbatch, train_state, minibatches)
725
+ # total_loss = jax.tree.map(lambda x: x.mean(), total_loss)
726
+ update_state = (
727
+ train_state,
728
+ init_hstate,
729
+ traj_batch,
730
+ advantages,
731
+ targets,
732
+ rng,
733
+ )
734
+ return update_state, total_loss
735
+
736
+ # init_hstate = initial_hstate[None, :].squeeze().transpose()
737
+ init_hstate = jax.tree.map(lambda x: x[None, :].squeeze().transpose(), initial_hstate)
738
+ update_state = (
739
+ train_state,
740
+ init_hstate,
741
+ traj_batch,
742
+ advantages,
743
+ targets,
744
+ rng,
745
+ )
746
+ update_state, loss_info = jax.lax.scan(_update_epoch, update_state, None, config["update_epochs"])
747
+ train_state = update_state[0]
748
+ metric = traj_batch.info
749
+ metric = jax.tree.map(
750
+ lambda x: x.reshape((config["num_steps"], config["num_train_envs"])), # , env.num_agents
751
+ traj_batch.info,
752
+ )
753
+ rng = update_state[-1]
754
+
755
+ def callback(metric):
756
+ dones = metric["dones"]
757
+ wandb.log(
758
+ {
759
+ "episode_return": (metric["returned_episode_returns"] * dones).sum() / jnp.maximum(1, dones.sum()),
760
+ "episode_solved": (metric["returned_episode_solved"] * dones).sum() / jnp.maximum(1, dones.sum()),
761
+ "episode_length": (metric["returned_episode_lengths"] * dones).sum() / jnp.maximum(1, dones.sum()),
762
+ "timing/num_env_steps": int(
763
+ int(metric["update_steps"]) * int(config["num_train_envs"]) * int(config["num_steps"])
764
+ ),
765
+ "timing/num_updates": metric["update_steps"],
766
+ **metric["loss_info"],
767
+ }
768
+ )
769
+
770
+ loss_info = jax.tree.map(lambda x: x.mean(), loss_info)
771
+ metric["loss_info"] = {
772
+ "loss/total_loss": loss_info[0],
773
+ "loss/value_loss": loss_info[1][0],
774
+ "loss/policy_loss": loss_info[1][1],
775
+ "loss/entropy_loss": loss_info[1][2],
776
+ }
777
+ metric["dones"] = traj_batch.done
778
+ metric["update_steps"] = update_steps
779
+ jax.experimental.io_callback(callback, None, metric)
780
+
781
+ # SAMPLE NEW ENVS
782
+ rng, _rng, _rng2 = jax.random.split(rng, 3)
783
+ rng_reset = jax.random.split(_rng, config["num_envs_to_generate"])
784
+
785
+ new_levels = sample_random_levels(_rng2, config["num_envs_to_generate"])
786
+ obsv_gen, env_state_gen = jax.vmap(env.reset_to_level, in_axes=(0, 0, None))(rng_reset, new_levels, env_params)
787
+
788
+ rng, _rng, _rng2 = jax.random.split(rng, 3)
789
+ sampled_env_instances_idxs = jax.random.randint(_rng, (config["num_envs_from_sampled"],), 0, num_env_instances)
790
+ sampled_env_instances = jax.tree.map(lambda x: x.at[sampled_env_instances_idxs].get(), instances)
791
+ myrng = jax.random.split(_rng2, config["num_envs_from_sampled"])
792
+ obsv_sampled, env_state_sampled = jax.vmap(env.reset_to_level, in_axes=(0, 0))(myrng, sampled_env_instances)
793
+
794
+ obsv = jax.tree.map(lambda x, y: jnp.concatenate([x, y], axis=0), obsv_gen, obsv_sampled)
795
+ env_state = jax.tree.map(lambda x, y: jnp.concatenate([x, y], axis=0), env_state_gen, env_state_sampled)
796
+
797
+ start_state = env_state
798
+ hstate = ScannedRNN.initialize_carry(config["num_train_envs"])
799
+
800
+ update_steps = update_steps + 1
801
+ runner_state = (
802
+ train_state,
803
+ env_state,
804
+ start_state,
805
+ obsv,
806
+ jnp.zeros((config["num_train_envs"]), dtype=bool),
807
+ hstate,
808
+ update_steps,
809
+ rng,
810
+ )
811
+ return (runner_state, instances), metric
812
+
813
+ def log_buffer(learnability, levels, epoch):
814
+ num_samples = levels.polygon.position.shape[0]
815
+ states = levels
816
+ rows = 2
817
+ fig, axes = plt.subplots(rows, int(num_samples / rows), figsize=(20, 10))
818
+ axes = axes.flatten()
819
+ all_imgs = jax.vmap(render_fn)(states)
820
+ for i, ax in enumerate(axes):
821
+ # ax.imshow(train_state.plr_buffer.get_sample(i))
822
+ score = learnability[i]
823
+ ax.imshow(all_imgs[i] / 255.0)
824
+ ax.set_xticks([])
825
+ ax.set_yticks([])
826
+ ax.set_title(f"learnability: {score:.3f}")
827
+ ax.set_aspect("equal", "box")
828
+
829
+ plt.tight_layout()
830
+ fig.canvas.draw()
831
+ im = Image.frombytes("RGB", fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
832
+ plt.close()
833
+ return {"maps": wandb.Image(im)}
834
+
835
+ @jax.jit
836
+ def train_and_eval_step(runner_state, eval_rng):
837
+
838
+ learnability_rng, eval_singleton_rng, eval_sampled_rng, _rng = jax.random.split(eval_rng, 4)
839
+ # TRAIN
840
+ learnability_scores, instances, test_metrics = get_learnability_set(learnability_rng, runner_state[0].params)
841
+
842
+ if config["log_learnability_before_after"]:
843
+ learn_scores_before, success_score_before = log_buffer_learnability(
844
+ learnability_rng, runner_state[0], instances
845
+ )
846
+
847
+ print("instance size", sum(x.size for x in jax.tree_util.tree_leaves(instances)))
848
+
849
+ runner_state_instances = (runner_state, instances)
850
+ runner_state_instances, metrics = jax.lax.scan(train_step, runner_state_instances, None, config["eval_freq"])
851
+
852
+ if config["log_learnability_before_after"]:
853
+ learn_scores_after, success_score_after = log_buffer_learnability(
854
+ learnability_rng, runner_state_instances[0][0], instances
855
+ )
856
+
857
+ # EVAL
858
+ rng, rng_eval = jax.random.split(eval_singleton_rng)
859
+ (states, cum_rewards, _, episode_lengths, eval_infos), (eval_dones, eval_rewards) = jax.vmap(eval, (0, None))(
860
+ jax.random.split(rng_eval, config["eval_num_attempts"]), runner_state_instances[0][0]
861
+ )
862
+ all_eval_eplens = episode_lengths
863
+
864
+ # Collect Metrics
865
+ eval_returns = cum_rewards.mean(axis=0) # (num_eval_levels,)
866
+ eval_solves = (eval_infos["returned_episode_solved"] * eval_dones).sum(axis=1) / jnp.maximum(
867
+ 1, eval_dones.sum(axis=1)
868
+ )
869
+ eval_solves = eval_solves.mean(axis=0)
870
+ # just grab the first run
871
+ states, episode_lengths = jax.tree_util.tree_map(
872
+ lambda x: x[0], (states, episode_lengths)
873
+ ) # (num_steps, num_eval_levels, ...), (num_eval_levels,)
874
+ # And one attempt
875
+ states = jax.tree_util.tree_map(lambda x: x[:, :], states)
876
+ episode_lengths = episode_lengths[:]
877
+ images = jax.vmap(jax.vmap(render_fn_eval))(
878
+ states.env_state.env_state.env_state
879
+ ) # (num_steps, num_eval_levels, ...)
880
+ frames = images.transpose(
881
+ 0, 1, 4, 2, 3
882
+ ) # WandB expects color channel before image dimensions when dealing with animations for some reason
883
+
884
+ test_metrics["update_count"] = runner_state[-2]
885
+ test_metrics["eval_returns"] = eval_returns
886
+ test_metrics["eval_ep_lengths"] = episode_lengths
887
+ test_metrics["eval_animation"] = (frames, episode_lengths)
888
+
889
+ # Eval on sampled
890
+ dr_states, dr_cum_rewards, _, dr_episode_lengths, dr_infos = jax.vmap(eval_on_dr_levels, (0, None))(
891
+ jax.random.split(rng_eval, config["eval_num_attempts"]), runner_state_instances[0][0]
892
+ )
893
+
894
+ eval_dr_returns = dr_cum_rewards.mean(axis=0).mean()
895
+ eval_dr_eplen = dr_episode_lengths.mean(axis=0).mean()
896
+
897
+ test_metrics["eval/mean_eval_return_sampled"] = eval_dr_returns
898
+ my_eval_dones = dr_infos["returned_episode"]
899
+ eval_dr_solves = (dr_infos["returned_episode_solved"] * my_eval_dones).sum(axis=1) / jnp.maximum(
900
+ 1, my_eval_dones.sum(axis=1)
901
+ )
902
+
903
+ test_metrics["eval/mean_eval_solve_rate_sampled"] = eval_dr_solves
904
+ test_metrics["eval/mean_eval_eplen_sampled"] = eval_dr_eplen
905
+
906
+ # Collect Metrics
907
+ eval_returns = cum_rewards.mean(axis=0) # (num_eval_levels,)
908
+
909
+ log_dict = {}
910
+
911
+ log_dict["to_remove"] = {
912
+ "eval_return": eval_returns,
913
+ "eval_solve_rate": eval_solves,
914
+ "eval_eplen": all_eval_eplens,
915
+ }
916
+
917
+ for i, name in enumerate(config["eval_levels"]):
918
+ log_dict[f"eval_avg_return/{name}"] = eval_returns[i]
919
+ log_dict[f"eval_avg_solve_rate/{name}"] = eval_solves[i]
920
+
921
+ log_dict.update({"eval/mean_eval_return": eval_returns.mean()})
922
+ log_dict.update({"eval/mean_eval_solve_rate": eval_solves.mean()})
923
+ log_dict.update({"eval/mean_eval_eplen": all_eval_eplens.mean()})
924
+
925
+ test_metrics.update(log_dict)
926
+
927
+ runner_state, _ = runner_state_instances
928
+ test_metrics["update_count"] = runner_state[-2]
929
+
930
+ top_instances = jax.tree.map(lambda x: x.at[-5:].get(), instances)
931
+
932
+ # Eval on top learnable levels
933
+ tl_states, tl_cum_rewards, _, tl_episode_lengths, tl_infos = jax.vmap(
934
+ eval_on_top_learnable_levels, (0, None, None)
935
+ )(jax.random.split(rng_eval, config["eval_num_attempts"]), runner_state_instances[0][0], top_instances)
936
+
937
+ # just grab the first run
938
+ states, episode_lengths = jax.tree_util.tree_map(
939
+ lambda x: x[0], (tl_states, tl_episode_lengths)
940
+ ) # (num_steps, num_eval_levels, ...), (num_eval_levels,)
941
+ # And one attempt
942
+ states = jax.tree_util.tree_map(lambda x: x[:, :], states)
943
+ episode_lengths = episode_lengths[:]
944
+ images = jax.vmap(jax.vmap(render_fn))(
945
+ states.env_state.env_state.env_state
946
+ ) # (num_steps, num_eval_levels, ...)
947
+ frames = images.transpose(
948
+ 0, 1, 4, 2, 3
949
+ ) # WandB expects color channel before image dimensions when dealing with animations for some reason
950
+
951
+ test_metrics["top_learnable_animation"] = (frames, episode_lengths, tl_cum_rewards)
952
+
953
+ if config["log_learnability_before_after"]:
954
+
955
+ def single(x, name):
956
+ return {
957
+ f"{name}_mean": x.mean(),
958
+ f"{name}_std": x.std(),
959
+ f"{name}_min": x.min(),
960
+ f"{name}_max": x.max(),
961
+ f"{name}_median": jnp.median(x),
962
+ }
963
+
964
+ test_metrics["learnability_log_v2/"] = {
965
+ **single(learn_scores_before, "learnability_before"),
966
+ **single(learn_scores_after, "learnability_after"),
967
+ **single(success_score_before, "success_score_before"),
968
+ **single(success_score_after, "success_score_after"),
969
+ }
970
+
971
+ return runner_state, (learnability_scores.at[-20:].get(), top_instances), test_metrics
972
+
973
+ rng, _rng = jax.random.split(rng)
974
+ runner_state = (
975
+ train_state,
976
+ env_state,
977
+ start_state,
978
+ obsv,
979
+ jnp.zeros((config["num_train_envs"]), dtype=bool),
980
+ init_hstate,
981
+ 0,
982
+ _rng,
983
+ )
984
+
985
+ def log_eval(stats):
986
+ log_dict = {}
987
+
988
+ to_remove = stats["to_remove"]
989
+ del stats["to_remove"]
990
+
991
+ def _aggregate_per_size(values, name):
992
+ to_return = {}
993
+ for group_name, indices in eval_group_indices.items():
994
+ to_return[f"{name}_{group_name}"] = values[indices].mean()
995
+ return to_return
996
+
997
+ env_steps = stats["update_count"] * config["num_train_envs"] * config["num_steps"]
998
+ env_steps_delta = config["eval_freq"] * config["num_train_envs"] * config["num_steps"]
999
+ time_now = time.time()
1000
+ log_dict = {
1001
+ "timing/num_updates": stats["update_count"],
1002
+ "timing/num_env_steps": env_steps,
1003
+ "timing/sps": env_steps_delta / stats["time_delta"],
1004
+ "timing/sps_agg": env_steps / (time_now - time_start),
1005
+ }
1006
+ log_dict.update(_aggregate_per_size(to_remove["eval_return"], "eval_aggregate/return"))
1007
+ log_dict.update(_aggregate_per_size(to_remove["eval_solve_rate"], "eval_aggregate/solve_rate"))
1008
+
1009
+ for i in range((len(config["eval_levels"]))):
1010
+ frames, episode_length = stats["eval_animation"][0][:, i], stats["eval_animation"][1][i]
1011
+ frames = np.array(frames[:episode_length])
1012
+ log_dict.update(
1013
+ {
1014
+ f"media/eval_video_{config['eval_levels'][i]}": wandb.Video(
1015
+ frames.astype(np.uint8), fps=15, caption=f"(len {episode_length})"
1016
+ )
1017
+ }
1018
+ )
1019
+
1020
+ for j in range(5):
1021
+ frames, episode_length, cum_rewards = (
1022
+ stats["top_learnable_animation"][0][:, j],
1023
+ stats["top_learnable_animation"][1][j],
1024
+ stats["top_learnable_animation"][2][:, j],
1025
+ ) # num attempts
1026
+ rr = "|".join([f"{r:<.2f}" for r in cum_rewards])
1027
+ frames = np.array(frames[:episode_length])
1028
+ log_dict.update(
1029
+ {
1030
+ f"media/tl_animation_{j}": wandb.Video(
1031
+ frames.astype(np.uint8), fps=15, caption=f"(len {episode_length})\n{rr}"
1032
+ )
1033
+ }
1034
+ )
1035
+
1036
+ stats.update(log_dict)
1037
+ wandb.log(stats, step=stats["update_count"])
1038
+
1039
+ checkpoint_steps = config["checkpoint_save_freq"]
1040
+ assert config["num_updates"] % config["eval_freq"] == 0, "num_updates must be divisible by eval_freq"
1041
+
1042
+ for eval_step in range(int(config["num_updates"] // config["eval_freq"])):
1043
+ start_time = time.time()
1044
+ rng, eval_rng = jax.random.split(rng)
1045
+ runner_state, instances, metrics = train_and_eval_step(runner_state, eval_rng)
1046
+ curr_time = time.time()
1047
+ metrics.update(log_buffer(*instances, metrics["update_count"]))
1048
+ metrics["time_delta"] = curr_time - start_time
1049
+ metrics["steps_per_section"] = (config["eval_freq"] * config["num_steps"] * config["num_train_envs"]) / metrics[
1050
+ "time_delta"
1051
+ ]
1052
+ log_eval(metrics)
1053
+ if ((eval_step + 1) * config["eval_freq"]) % checkpoint_steps == 0:
1054
+ if config["save_path"] is not None:
1055
+ steps = int(metrics["update_count"]) * int(config["num_train_envs"]) * int(config["num_steps"])
1056
+ # save_params_to_wandb(runner_state[0].params, steps, config)
1057
+ save_model_to_wandb(runner_state[0], steps, config)
1058
+
1059
+ if config["save_path"] is not None:
1060
+ # save_params_to_wandb(runner_state[0].params, config["total_timesteps"], config)
1061
+ save_model_to_wandb(runner_state[0], config["total_timesteps"], config)
1062
+
1063
+
1064
+ if __name__ == "__main__":
1065
+ # with jax.disable_jit():
1066
+ # main()
1067
+ main()
Kinetix/images/bb.gif ADDED
Kinetix/images/cartpole.gif ADDED