{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "nwaAZRu1NTiI" }, "source": [ "# DQN\n", "\n", "#### This version implements DQN with Keras\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Hi everybody, I just finished coding a DQN from scratch that can solve CartPole-v1: https://huggingface.co/bonadio/rl-fin/blob/main/DQN_v1.ipynb \n", "Training takes 6000 steps; here is the result: https://huggingface.co/bonadio/rl-fin/blob/main/DQN_v1_result.mp4\n", "\n", "Coding an RL algorithm is really challenging. The biggest difficulty is that there are a lot of parameters to tune, and it is hard to know when you are on the right track. \n", "For me, the main points for DQN were: \n", "1- Number of layers of the NN: I started with only 2 layers of 64 units, but had to grow to 3 layers with 512, 256 and 128 units.\n", "\n", "2- NN update frequency, i.e. how often you copy the weights from one NN to the other: I started with every 50 steps, but ended up with 5; the more often you update, the better.\n", "\n", "3- Size of the batch, i.e. the number of samples that you take from the replay memory: it seems that the bigger, the better. I started with 10 and ended up with 100. \n", "\n", "4- Epsilon decay: it seems that the network only really starts to learn after it stops taking random actions. A very small end value is good; I used 0.001.\n", "\n", "5- The size of the replay memory does not make a big difference: I used 10,000, but I think it could be smaller, like 5,000.\n", "\n", "Now that I have a working DQN, I will adapt it to trade Ethereum by converting my code from Q-learning to DQN. 
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "LNXxxKojNTiL" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2022-12-22 18:43:04.111595: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "\n" ] } ], "source": [ "import tensorflow as tf\n", "from tensorflow.keras import layers\n", "from tensorflow.keras.utils import to_categorical\n", "import gym\n", "from gym import spaces\n", "from gym.utils import seeding\n", "from gym import wrappers\n", "\n", "from tqdm.notebook import tqdm\n", "from collections import deque\n", "import numpy as np\n", "import random\n", "from matplotlib import pyplot as plt\n", "from sklearn.preprocessing import MinMaxScaler\n", "\n", "import io\n", "import base64\n", "from IPython.display import HTML, Video\n" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "class DQN:\n", " def __init__(self, env=None, replay_buffer_size=1000, action_size=2):\n", " self.replay_buffer = deque(maxlen=replay_buffer_size)\n", "\n", " self.action_size = action_size\n", "\n", " # Hyperparameters\n", " self.gamma = 0.95 # Discount rate\n", " self.epsilon = 1.0 # Exploration rate\n", " self.epsilon_min = 0.001 # Minimal exploration rate (epsilon-greedy)\n", " self.epsilon_decay = 0.95 # Decay rate for epsilon\n", " self.update_rate = 5 # Number of steps until updating the target network\n", " self.batch_size = 100\n", " self.learning_rate = 2.5e-4\n", " \n", " # Construct DQN models\n", " self.model = self._build_model()\n", " self.target_model = self._build_model()\n", " self.target_model.set_weights(self.model.get_weights())\n", " self.model.summary()\n", " self.env = env\n", " self.action_size = 
action_size\n", "\n", " self.scaler = None\n", "\n", " def _build_model(self):\n", " model = tf.keras.Sequential()\n", " \n", " model.add(tf.keras.Input(shape=(4,)))\n", " model.add(layers.Dense(512, activation = 'relu'))\n", " model.add(layers.Dense(256, activation = 'relu'))\n", " model.add(layers.Dense(128, activation = 'relu'))\n", " model.add(layers.Dense(self.action_size, activation = 'linear'))\n", " # model.compile(optimizer = RMSprop(lr = self.lr, rho = 0.95, epsilon = 0.01), loss = \"mse\", metrics = ['accuracy'])\n", " \n", " optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)\n", " # model.compile(loss='mse', optimizer=tf.keras.optimizers.RMSprop(lr = self.learning_rate, rho = 0.95, epsilon = 0.01), metrics = ['accuracy'])\n", " model.compile(loss='mse', optimizer=optimizer, metrics = ['accuracy'])\n", " return model\n", "\n", " def _min_max(self):\n", " \"\"\"Run some steps to get data to do MINMAX scale \"\"\"\n", " state_arr = []\n", " state = self.env.reset()\n", " state_arr.append(self.env.observation_space.high)\n", " state_arr.append(self.env.observation_space.low)\n", " for i in range(1000):\n", " random_action = self.env.action_space.sample()\n", " next_state, reward, done, info = self.env.step(random_action)\n", " state_arr.append(next_state)\n", " if done:\n", " state = self.env.reset()\n", "\n", " state_arr = np.array(state_arr)\n", " self.scaler = MinMaxScaler()\n", " self.scaler.fit(state_arr)\n", "\n", " #\n", " # Trains the model using randomly selected experiences in the replay memory\n", " #\n", " def _train(self):\n", " X, y = [], []\n", " # state, action, reward, next_state, done \n", " # create the targets \n", " if self.batch_size > len(self.replay_buffer):\n", " return\n", " minibatch = random.sample(self.replay_buffer, self.batch_size)\n", " mb_arr = np.array(minibatch, dtype=object)\n", "\n", " next_state_arr = np.stack(mb_arr[:,3])\n", " future_qvalues = self.target_model.predict(next_state_arr, 
verbose=0)\n", "\n", " state_arr = np.stack(mb_arr[:,0])\n", " qvalues = self.model.predict(state_arr, verbose=0)\n", "\n", " for index, (state, action, reward, next_state, done) in enumerate(minibatch):\n", " if done == True:\n", " q_target = reward\n", " else:\n", " q_target = reward + self.gamma * np.max(future_qvalues[index])\n", "\n", " q_curr = qvalues[index]\n", " q_curr[action] = q_target \n", " X.append(state)\n", " y.append(q_curr)\n", "\n", " # Perform gradient step\n", " X, y = np.array(X), np.array(y)\n", " history = self.model.fit(X, y, batch_size = self.batch_size, shuffle = False, verbose=0)\n", " # history = self.model.fit(X, y, epochs=1, verbose=0)\n", " # print(f\"Loss: {history.history['loss']} \")\n", "\n", "\n", " def learn(self, total_steps=None):\n", " #create scaler\n", " self._min_max()\n", " current_episode = 0\n", " total_reward = 0\n", " rewards = [0]\n", " current_step = 0\n", " while current_step < total_steps:\n", " current_episode += 1\n", " state = self.env.reset()\n", " total_reward = 0\n", " done = False\n", " while done != True:\n", " current_step +=1\n", " # e-greedy\n", " if np.random.random() > (1 - self.epsilon):\n", " action = random.randrange(self.action_size)\n", " else:\n", " model_predict = self.model.predict(np.array([state]), verbose=0)\n", " action = np.argmax(model_predict)\n", "\n", " # step\n", " next_state, reward, done, info = self.env.step(action)\n", " total_reward += reward\n", "\n", " # add to buffer\n", " self.replay_buffer.append((state, action, reward, next_state, done))\n", "\n", " if current_step>10 and current_step % self.update_rate == 0:\n", " print(f\"epsilon:{self.epsilon} step:{current_step} episode:{current_episode} last_score {rewards[-1]} \")\n", " self._train()\n", " # update target\n", " self.target_model.set_weights(self.model.get_weights())\n", " \n", " state = next_state\n", " \n", " rewards.append(total_reward)\n", " # update epsilon\n", " if self.epsilon > self.epsilon_min:\n", " 
self.epsilon *= self.epsilon_decay\n", " #\n", " # Loads a saved model\n", " #\n", " def load(self, name):\n", " self.model.load_weights(name)\n", "\n", " #\n", " # Saves parameters of a trained model\n", " #\n", " def save(self, name):\n", " self.model.save_weights(name)\n", "\n", " def play(self, state):\n", " return np.argmax(self.model.predict(np.array([state]), verbose=0)[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "env = gym.make('CartPole-v1')\n", "\n", "model = DQN(env=env, replay_buffer_size=10_000, action_size=2)\n", "model.learn(total_steps=6_000)\n", "env.close()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "model.save(\"./alt/m1.h5\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"sequential_28\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " dense_97 (Dense) (None, 512) 2560 \n", " \n", " dense_98 (Dense) (None, 256) 131328 \n", " \n", " dense_99 (Dense) (None, 128) 32896 \n", " \n", " dense_100 (Dense) (None, 2) 258 \n", " \n", "=================================================================\n", "Total params: 167,042\n", "Trainable params: 167,042\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "Total reward 500.0\n" ] } ], "source": [ "eval_env = gym.make('CartPole-v1')\n", "model = DQN(env=eval_env, replay_buffer_size=10_000, action_size=2)\n", "model.load(\"./alt/m1.h5\")\n", "eval_env = wrappers.Monitor(eval_env, \"./alt/gym-results\", force=True)\n", "state = eval_env.reset()\n", "total_reward = 0\n", "for _ in range(1000):\n", " action = model.play(state)\n", " observation, reward, done, info = eval_env.step(action)\n", " total_reward 
+=reward\n", " state = observation\n", " if done: \n", " print(f\"Total reward {total_reward}\")\n", " break\n", "eval_env.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3.8.13 ('rl2')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "cd60ab8388a66026f336166410d6a8a46ddf65ece2e85ad2d46c8b98d87580d1" } }, "widgets": { "application/vnd.jupyter.widget-state+json": { "01a2dbcb714e40148b41c761fcf43147": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "20b0f38ec3234ff28a62a286cd57b933": { "model_module": 
"@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "PasswordModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "PasswordModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "PasswordView", "continuous_update": true, "description": "Token:", "description_tooltip": null, "disabled": false, "layout": "IPY_MODEL_01a2dbcb714e40148b41c761fcf43147", "placeholder": "", "style": "IPY_MODEL_90c874e91b304ee1a7ef147767ac00ce", "value": "" } }, "270cbb5d6e9c4b1e9e2f39c8b3b0c15f": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "VBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [ "IPY_MODEL_a02224a43d8d4af3bd31d326540d25da", "IPY_MODEL_20b0f38ec3234ff28a62a286cd57b933", "IPY_MODEL_f6c845330d6743c0b35c2c7ad834de77", "IPY_MODEL_f1675c09d16a4251b403f9c56255f168", "IPY_MODEL_c1a82965ae26479a98e4fdbde1e64ec2" ], "layout": "IPY_MODEL_3fa248114ac24656ba74923936a94d2d" } }, "2dc5fa9aa3334dfcbdee9c238f2ef60b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "3e753b0212644990b558c68853ff2041": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": 
"1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3fa248114ac24656ba74923936a94d2d": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": "center", "align_self": null, "border": null, "bottom": null, "display": "flex", "flex": null, "flex_flow": "column", "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": 
null, "right": null, "top": null, "visibility": null, "width": "50%" } }, "42d140b838b844819bc127afc1b7bc84": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "90c874e91b304ee1a7ef147767ac00ce": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "9d847f9a7d47458d8cd57d9b599e47c6": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": 
null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a02224a43d8d4af3bd31d326540d25da": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_caef095934ec47bbb8b64eab22049284", "placeholder": "", "style": "IPY_MODEL_2dc5fa9aa3334dfcbdee9c238f2ef60b", "value": "