"""Gradio demo: distil a pre-trained RL policy into a KAN and extract a symbolic policy."""

import glob
import os

import gymnasium as gym
import numpy as np
from gymnasium.wrappers import RecordVideo
from moviepy.video.compositing.concatenate import concatenate_videoclips
from moviepy.video.io.VideoFileClip import VideoFileClip
from sympy import latex

from interpretable import InterpretablePolicyExtractor
from utils import generate_dataset_from_expert, rollouts
import matplotlib.pyplot as plt
import torch
import gradio as gr

# Markdown shown at the top of the page. It must keep its line breaks: on a
# single line everything after the leading "# " is rendered as one heading.
intro = """
# Making RL Policy Interpretable with Kolmogorov-Arnold Network 🧠 ➙ 🔢

Waris Radji1, Corentin Léger2, Hector Kohler1
1[Inria, team Scool](https://team.inria.fr/scool/) 2[Inria, team Flowers](https://flowers.inria.fr/)

In this demo, we showcase a method to make a trained Reinforcement Learning (RL) policy interpretable using the Kolmogorov-Arnold Network (KAN). The process involves transferring the knowledge from a pre-trained RL policy to a KAN. We achieve this by training the KAN to map actions from observations obtained from trajectories of the pre-trained policy.

## Procedure

- Train the KAN using observations from trajectories generated by a pre-trained RL policy, the KAN learns to map observations to corresponding actions.
- Apply symbolic regression algorithms to the KAN's learned mapping.
- Extract an interpretable policy expressed in symbolic form.

For more information about KAN you can read the [paper](https://arxiv.org/abs/2404.19756), and check the [PyTorch official information](https://github.com/KindXiaoming/pykan). To follow the progress of KAN in RL you can check the repo [kanrl](https://github.com/riiswa/kanrl) (you can run this app locally).

[![riiswa/kanrl - GitHub](https://gh-card.dev/repos/riiswa/kanrl.svg)](https://github.com/riiswa/kanrl)

*Please be patient, as the process may take a few minutes to run, especially in environments with large state/action spaces or with a complex KAN architecture. For optimal performance, default parameters may not suffice.
Feel free to experiment with different settings to achieve desired results.*
"""

# Environments offered in the dropdown.
envs = [
    "CartPole-v1",
    "MountainCar-v0",
    "Acrobot-v1",
    "Pendulum-v1",
    "MountainCarContinuous-v0",
    "LunarLander-v2",
    "Swimmer-v3",
    "Hopper-v3",
    "HalfCheetah-v3",
    "Walker2d-v3",
]

# Environments whose expert is loaded as "trpo"; every other one uses "ppo".
_TRPO_ENVS = frozenset({"Swimmer-v3", "Walker2d-v3"})

# Candidate primitives handed to the policy's `auto_symbolic` call.
_SYMBOLIC_LIB = ('x', 'x^2', 'x^3', 'x^4', 'exp', 'log', 'sqrt', 'tanh', 'sin', 'abs')

# Folder where RecordVideo writes the episodes and where the merged clip is saved.
_VIDEO_DIR = "videos"

css = """ #formula {overflow-x: auto!important}; """


def load_video_and_dataset(_env_name):
    """Generate the expert dataset/video for an environment and reset the session state.

    Args:
        _env_name: Environment id selected in the dropdown.

    Returns:
        A 3-tuple matching the event outputs: the expert video path, an
        enabled "Compute the symbolic policy!" button, and a fresh state dict
        with keys ``dataset_path``, ``ipe`` (reset to ``None``) and ``env_name``.
    """
    agent = "trpo" if _env_name in _TRPO_ENVS else "ppo"
    # NOTE(review): the meaning of the positional 15 and 3 is defined by
    # utils.generate_dataset_from_expert, which is not visible here.
    dataset_path, video_path = generate_dataset_from_expert(agent, _env_name, 15, 3)
    new_state = {
        "dataset_path": dataset_path,
        "ipe": None,
        "env_name": _env_name,
    }
    return video_path, gr.Button("Compute the symbolic policy!", interactive=True), new_state


def parse_integer_list(input_str):
    """Parse a comma-separated list of integers.

    Args:
        input_str: Raw text such as ``"3,3"``.

    Returns:
        ``None`` when the input is empty or only whitespace, a tuple of ints
        when every element parses, and ``False`` when any element is not an
        integer. Callers must test the failure case with ``is False`` because
        ``None`` is a valid result.
    """
    if not input_str or input_str.isspace():
        return None
    try:
        return tuple(int(elem.strip()) for elem in input_str.split(','))
    except ValueError:
        return False


def extract_interpretable_policy(kan_widths, epochs, state):
    """Train a KAN on the expert dataset, prune it and render its architecture.

    Args:
        kan_widths: Raw textbox content describing the hidden-layer widths.
        epochs: Number of training steps forwarded as ``steps``.
        state: Session dict holding ``env_name`` and ``dataset_path``; its
            ``ipe`` entry is overwritten with the new extractor.

    Returns:
        ``(kan_architecture, state)`` where ``kan_architecture`` is the RGBA
        pixel array of the plotted network. This matches the two outputs wired
        to the button click.
    """
    widths = parse_integer_list(kan_widths)
    # Check the *parsed* value: the raw textbox string is never False, so the
    # previous test on `kan_widths` let the False sentinel reach the extractor.
    if widths is False:
        gr.Warning(
            f"Please enter widths {kan_widths} in the right format... "
            "The current run is executed with no hidden layer."
        )
        widths = None

    extractor = InterpretablePolicyExtractor(state["env_name"], widths)
    state["ipe"] = extractor
    extractor.train_from_dataset(state["dataset_path"], steps=epochs)
    extractor.policy.prune()
    extractor.policy.plot(mask=True, scale=5)

    # Grab the figure that `plot` drew on and rasterise it before closing it.
    fig = plt.gcf()
    fig.canvas.draw()
    kan_architecture = np.array(fig.canvas.renderer.buffer_rgba())
    plt.close(fig)
    return kan_architecture, state


def symbolic_policy(state):
    """Symbolify the trained KAN, record rollouts of it and format its formula.

    Args:
        state: Session dict whose ``ipe`` entry is the trained extractor and
            whose ``env_name`` entry is the environment id.

    Returns:
        ``(video_path, symbolic_formula)``: the path of the merged rollout
        video and a Markdown string with one LaTeX formula per action output.
    """
    env_name = state["env_name"]
    extractor = state["ipe"]
    # Pass a fresh list, as the original did, in case the callee mutates it.
    extractor.policy.auto_symbolic(lib=list(_SYMBOLIC_LIB))

    prefix = f"kan-{env_name}"
    env = gym.make(env_name, render_mode="rgb_array")
    env = RecordVideo(
        env,
        video_folder=_VIDEO_DIR,
        episode_trigger=lambda x: True,
        name_prefix=prefix,
    )
    rollouts(env, extractor.forward, 2)

    video_path = os.path.join(_VIDEO_DIR, f"{prefix}.mp4")
    # glob returns matches in arbitrary order; sort so episodes are
    # concatenated in recording order.
    episode_files = sorted(
        glob.glob(os.path.join(_VIDEO_DIR, f"{glob.escape(prefix)}-episode*.mp4"))
    )

    clips = []
    try:
        for episode_file in episode_files:
            clips.append(VideoFileClip(episode_file))
        final_clip = concatenate_videoclips(clips)
        try:
            final_clip.write_videofile(video_path, codec="libx264", fps=24)
        finally:
            final_clip.close()
    finally:
        # Release the readers held by every opened clip, even on failure.
        for clip in clips:
            clip.close()

    formulas = extractor.policy.symbolic_formula()[0]
    lines = ["### The symbolic formula of the policy is:"]
    lines.extend(
        "$$ a_" + str(i) + "=" + latex(formula) + "$$"
        for i, formula in enumerate(formulas)
    )
    if extractor._action_is_discrete:
        lines.append(r"$$ a = \underset{i}{\mathrm{argmax}} \ a_i.$$")
    return video_path, "\n".join(lines)


def build_app():
    """Build the Gradio Blocks UI and wire its events; return the app without launching it."""
    with gr.Blocks(theme='gradio/monochrome', css=css) as app:
        state = gr.State({
            "dataset_path": None,
            "ipe": None,
            "env_name": None,
        })
        gr.Markdown(intro)
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Pretrained policy loading (PPO or TRPO from [rl-baselines3-zoo](https://github.com/DLR-RM/rl-baselines3-zoo))")
                choice = gr.Dropdown(envs, label="Environment name")
                expert_video = gr.Video(label="Expert policy video", interactive=False, autoplay=True)
                kan_widths = gr.Textbox(value="2", label="Widths of the hidden layers of the KAN, separated by commas (e.g. `3,3`). Leave empty if there are no hidden layers.")
                # precision=0 makes the component deliver an int, which is
                # what a step count should be.
                epochs = gr.Number(value=20, label="KAN training Steps.", minimum=1, maximum=100, precision=0)
                # Stays disabled until an environment has been loaded.
                button = gr.Button("Compute the symbolic policy!", interactive=False)
            with gr.Column():
                gr.Markdown("### Symbolic policy extraction")
                kan_architecture = gr.Image(interactive=False, label="KAN architecture")
                sym_video = gr.Video(label="Symbolic policy video", interactive=False, autoplay=True)
                sym_formula = gr.Markdown(elem_id="formula")

        choice.input(
            load_video_and_dataset,
            inputs=[choice],
            outputs=[expert_video, button, state],
        )
        # Train and plot first, then symbolify and record with the updated state.
        button.click(
            extract_interpretable_policy,
            inputs=[kan_widths, epochs, state],
            outputs=[kan_architecture, state],
        ).then(
            symbolic_policy,
            inputs=[state],
            outputs=[sym_video, sym_formula],
        )
    return app


if __name__ == "__main__":
    torch.set_default_dtype(torch.float32)
    build_app().launch()