{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "nwaAZRu1NTiI" }, "source": [ "# DQN\n", "\n", "#### This version implements DQN with Keras\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "LNXxxKojNTiL" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2022-12-22 18:43:04.111595: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "\n" ] } ], "source": [ "import tensorflow as tf\n", "from tensorflow.keras import layers\n", "from tensorflow.keras.utils import to_categorical\n", "import gym\n", "from gym import spaces\n", "from gym.utils import seeding\n", "from gym import wrappers\n", "\n", "from tqdm.notebook import tqdm\n", "from collections import deque\n", "import numpy as np\n", "import random\n", "from matplotlib import pyplot as plt\n", "from sklearn.preprocessing import MinMaxScaler\n", "\n", "import io\n", "import base64\n", "from IPython.display import HTML, Video\n" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "class DQN:\n", " def __init__(self, env=None, replay_buffer_size=1000, action_size=2):\n", " self.replay_buffer = deque(maxlen=replay_buffer_size)\n", "\n", " self.action_size = action_size\n", "\n", " # Hyperparameters\n", " self.gamma = 0.95 # Discount rate\n", " self.epsilon = 1.0 # Exploration rate\n", " self.epsilon_min = 0.001 # Minimal exploration rate (epsilon-greedy)\n", " self.epsilon_decay = 0.95 # Decay rate for epsilon\n", " self.update_rate = 5 # Number of steps until updating the target network\n", " self.batch_size = 100\n", " self.learning_rate = 2.5e-4\n", " \n", " # Construct DQN models\n", " self.model = self._build_model()\n", " self.target_model = self._build_model()\n", " self.target_model.set_weights(self.model.get_weights())\n", " self.model.summary()\n", " self.env = env\n", " self.action_size = action_size\n", "\n", " self.scaler = None\n", "\n", " def _build_model(self):\n", " model = tf.keras.Sequential()\n", " \n", " model.add(tf.keras.Input(shape=(4,)))\n", " model.add(layers.Dense(512, activation = 'relu'))\n", " model.add(layers.Dense(256, activation = 'relu'))\n", " model.add(layers.Dense(128, activation = 'relu'))\n", " model.add(layers.Dense(self.action_size, activation = 'linear'))\n", " # model.compile(optimizer = RMSprop(lr = self.lr, rho = 0.95, epsilon = 0.01), loss = \"mse\", metrics = ['accuracy'])\n", " \n", " optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)\n", " # model.compile(loss='mse', optimizer=tf.keras.optimizers.RMSprop(lr = self.learning_rate, rho = 0.95, epsilon = 0.01), metrics = ['accuracy'])\n", " model.compile(loss='mse', optimizer=optimizer, metrics = ['accuracy'])\n", " return model\n", "\n", " def _min_max(self):\n", " \"\"\"Run some steps to get data to do MINMAX scale \"\"\"\n", " state_arr = []\n", " state = self.env.reset()\n", " state_arr.append(self.env.observation_space.high)\n", " state_arr.append(self.env.observation_space.low)\n", " for i in range(1000):\n", " random_action = self.env.action_space.sample()\n", " next_state, reward, done, info = self.env.step(random_action)\n", " state_arr.append(next_state)\n", " if done:\n", " state = self.env.reset()\n", "\n", " state_arr = np.array(state_arr)\n", " self.scaler = MinMaxScaler()\n", " self.scaler.fit(state_arr)\n", "\n", " #\n", " # Trains the model using randomly selected experiences in the replay memory\n", " #\n", " def _train(self):\n", " X, y = [], []\n", " # state, action, reward, next_state, done \n", " # create the targets \n", " if self.batch_size > len(self.replay_buffer):\n", " return\n", " minibatch = random.sample(self.replay_buffer, self.batch_size)\n", " mb_arr = np.array(minibatch, dtype=object)\n", "\n", " next_state_arr = np.stack(mb_arr[:,3])\n", " future_qvalues = self.target_model.predict(next_state_arr, verbose=0)\n", "\n", " state_arr = np.stack(mb_arr[:,0])\n", " qvalues = self.model.predict(state_arr, verbose=0)\n", "\n", " for index, (state, action, reward, next_state, done) in enumerate(minibatch):\n", " if done == True:\n", " q_target = reward\n", " else:\n", " q_target = reward + self.gamma * np.max(future_qvalues[index])\n", "\n", " q_curr = qvalues[index]\n", " q_curr[action] = q_target \n", " X.append(state)\n", " y.append(q_curr)\n", "\n", " # Perform gradient step\n", " X, y = np.array(X), np.array(y)\n", " history = self.model.fit(X, y, batch_size = self.batch_size, shuffle = False, verbose=0)\n", " # history = self.model.fit(X, y, epochs=1, verbose=0)\n", " # print(f\"Loss: {history.history['loss']} \")\n", "\n", "\n", " def learn(self, total_steps=None):\n", " #create scaler\n", " self._min_max()\n", " current_episode = 0\n", " total_reward = 0\n", " rewards = [0]\n", " current_step = 0\n", " while current_step < total_steps:\n", " current_episode += 1\n", " state = self.env.reset()\n", " total_reward = 0\n", " done = False\n", " while done != True:\n", " current_step +=1\n", " # e-greedy\n", " if np.random.random() > (1 - self.epsilon):\n", " action = random.randrange(self.action_size)\n", " else:\n", " model_predict = self.model.predict(np.array([state]), verbose=0)\n", " action = np.argmax(model_predict)\n", "\n", " # step\n", " next_state, reward, done, info = self.env.step(action)\n", " total_reward += reward\n", "\n", " # add to buffer\n", " self.replay_buffer.append((state, action, reward, next_state, done))\n", "\n", " if current_step>10 and current_step % self.update_rate == 0:\n", " print(f\"epsilon:{self.epsilon} step:{current_step} episode:{current_episode} last_score {rewards[-1]} \")\n", " self._train()\n", " # update target\n", " self.target_model.set_weights(self.model.get_weights())\n", " \n", " state = next_state\n", " \n", " rewards.append(total_reward)\n", " # update epsilon\n", " if self.epsilon > self.epsilon_min:\n", " self.epsilon *= self.epsilon_decay\n", " #\n", " # Loads a saved model\n", " #\n", " def load(self, name):\n", " self.model.load_weights(name)\n", "\n", " #\n", " # Saves parameters of a trained model\n", " #\n", " def save(self, name):\n", " self.model.save_weights(name)\n", "\n", " def play(self, state):\n", " return np.argmax(self.model.predict(np.array([state]), verbose=0)[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "env = gym.make('CartPole-v1')\n", "\n", "model = DQN(env=env, replay_buffer_size=10_000, action_size=2)\n", "model.learn(total_steps=6_000)\n", "env.close()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "model.save(\"./alt/m1.h5\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"sequential_28\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " dense_97 (Dense) (None, 512) 2560 \n", " \n", " dense_98 (Dense) (None, 256) 131328 \n", " \n", " dense_99 (Dense) (None, 128) 32896 \n", " \n", " dense_100 (Dense) (None, 2) 258 \n", " \n", "=================================================================\n", "Total params: 167,042\n", "Trainable params: 167,042\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "Total reward 500.0\n" ] } ], "source": [ "eval_env = gym.make('CartPole-v1')\n", "model = DQN(env=eval_env, replay_buffer_size=10_000, action_size=2)\n", "model.load(\"./alt/m1.h5\")\n", "eval_env = wrappers.Monitor(eval_env, \"./alt/gym-results\", force=True)\n", "state = eval_env.reset()\n", "total_reward = 0\n", "for _ in range(1000):\n", " action = model.play(state)\n", " observation, reward, done, info = eval_env.step(action)\n", " total_reward +=reward\n", " state = observation\n", " if done: \n", " print(f\"Total reward {total_reward}\")\n", " break\n", "eval_env.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3.8.13 ('rl2')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "cd60ab8388a66026f336166410d6a8a46ddf65ece2e85ad2d46c8b98d87580d1" } }, "widgets": { "application/vnd.jupyter.widget-state+json": { "01a2dbcb714e40148b41c761fcf43147": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "20b0f38ec3234ff28a62a286cd57b933": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "PasswordModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "PasswordModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "PasswordView", "continuous_update": true, "description": "Token:", "description_tooltip": null, "disabled": false, "layout": "IPY_MODEL_01a2dbcb714e40148b41c761fcf43147", "placeholder": "", "style": "IPY_MODEL_90c874e91b304ee1a7ef147767ac00ce", "value": "" } }, "270cbb5d6e9c4b1e9e2f39c8b3b0c15f": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "VBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [ "IPY_MODEL_a02224a43d8d4af3bd31d326540d25da", "IPY_MODEL_20b0f38ec3234ff28a62a286cd57b933", "IPY_MODEL_f6c845330d6743c0b35c2c7ad834de77", "IPY_MODEL_f1675c09d16a4251b403f9c56255f168", "IPY_MODEL_c1a82965ae26479a98e4fdbde1e64ec2" ], "layout": "IPY_MODEL_3fa248114ac24656ba74923936a94d2d" } }, "2dc5fa9aa3334dfcbdee9c238f2ef60b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "3e753b0212644990b558c68853ff2041": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3fa248114ac24656ba74923936a94d2d": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": "center", "align_self": null, "border": null, "bottom": null, "display": "flex", "flex": null, "flex_flow": "column", "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "50%" } }, "42d140b838b844819bc127afc1b7bc84": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "90c874e91b304ee1a7ef147767ac00ce": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "9d847f9a7d47458d8cd57d9b599e47c6": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a02224a43d8d4af3bd31d326540d25da": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_caef095934ec47bbb8b64eab22049284", "placeholder": "", "style": "IPY_MODEL_2dc5fa9aa3334dfcbdee9c238f2ef60b", "value": "