{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "ef458e0c", "metadata": { "papermill": { "duration": 0.002614, "end_time": "2023-09-29T05:06:25.725060", "exception": false, "start_time": "2023-09-29T05:06:25.722446", "status": "completed" }, "tags": [] }, "source": [ "# RWKV v5 multi-size training experiment\n", "\n", "**Note:** This project assumes you have the rwkv-infctx conda env setup" ] }, { "attachments": {}, "cell_type": "markdown", "id": "58eb3f3e", "metadata": { "papermill": { "duration": 0.00201, "end_time": "2023-09-29T05:06:25.730966", "exception": false, "start_time": "2023-09-29T05:06:25.728956", "status": "completed" }, "tags": [] }, "source": [ "# Basic Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "e0abbad9", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T05:06:25.737449Z", "iopub.status.busy": "2023-09-29T05:06:25.736495Z", "iopub.status.idle": "2023-09-29T05:06:26.482958Z", "shell.execute_reply": "2023-09-29T05:06:26.482054Z" }, "papermill": { "duration": 0.751859, "end_time": "2023-09-29T05:06:26.485032", "exception": false, "start_time": "2023-09-29T05:06:25.733173", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# First lets setup the various directories, and init the model\n", "!mkdir -p ../../../../model/\n", "!mkdir -p ../../../../datapath/\n", "!mkdir -p ../../../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "42d56a7f", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T05:06:26.491452Z", "iopub.status.busy": "2023-09-29T05:06:26.490928Z", "iopub.status.idle": "2023-09-29T05:06:26.499148Z", "shell.execute_reply": "2023-09-29T05:06:26.498384Z" }, "papermill": { "duration": 0.013307, "end_time": "2023-09-29T05:06:26.500768", "exception": false, "start_time": "2023-09-29T05:06:26.487461", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEEPSPEED_STRAT: deepspeed_stage_1\n", 
"ENABLE_WANDB: True\n", "GPU_DEVICES: auto\n", "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n", "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" ] } ], "source": [ "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", "GPU_DEVICES=\"auto\"\n", "ENABLE_WANDB=True\n", "\n", "EMBED_SCALE=0.01\n", "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", "\n", "LAYER_COUNT=6\n", "EMBED_SIZE=2048\n", "\n", "WANDB_PREFIX=f\"[Multi-size] v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n", "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n", "\n", "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", "\n", "if ENABLE_WANDB:\n", " WANDB_MODE=\"online\"\n", "else:\n", " WANDB_MODE=\"disabled\"\n", "\n", "# Computing the notebook, and various paths\n", "import os\n", "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "\n", "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", "print(\"PROJECT_DIR:\", PROJECT_DIR)" ] }, { "cell_type": "code", "execution_count": 3, "id": "5514ed91", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T05:06:26.507274Z", "iopub.status.busy": "2023-09-29T05:06:26.506786Z", "iopub.status.idle": "2023-09-29T05:06:55.991075Z", "shell.execute_reply": "2023-09-29T05:06:55.990231Z" }, "papermill": { "duration": 29.490941, "end_time": 
"2023-09-29T05:06:55.994238", "exception": false, "start_time": "2023-09-29T05:06:26.503297", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 05:06:30,625] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "---- Initializing model ----\r\n", "No of layers: 6\r\n", "Embedding size: 2048\r\n", "Output model path: ../model/v5-L6-D2048-E0_01-neox-v5base-init.pth\r\n", "Vocab size: 50277\r\n", "Emb scale: 0.01\r\n", "Note: this process takes a significant time (and ram) for large models\r\n", "---- ----- ----\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 2048 -0.01 emb.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.0.att.output.weight\r\n", "7168 2048 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.0.ffn.receptance.weight\r\n", "2048 7168 0 blocks.0.ffn.value.weight\r\n", "2048 2048 1.0 blocks.1.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 
blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.1.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7168 2048 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.1.ffn.receptance.weight\r\n", "2048 7168 0 blocks.1.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.2.att.output.weight\r\n", "7168 2048 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.2.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 7168 0 blocks.2.ffn.value.weight\r\n", "2048 2048 1.0 blocks.3.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.3.att.output.weight\r\n", "7168 2048 1.0 blocks.3.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.3.ffn.receptance.weight\r\n", "2048 7168 0 blocks.3.ffn.value.weight\r\n", "2048 2048 1.0 blocks.4.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 
blocks.4.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.4.att.output.weight\r\n", "7168 2048 1.0 blocks.4.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.4.ffn.receptance.weight\r\n", "2048 7168 0 blocks.4.ffn.value.weight\r\n", "2048 2048 1.0 blocks.5.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.5.att.output.weight\r\n", "7168 2048 1.0 blocks.5.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.5.ffn.receptance.weight\r\n", "2048 7168 0 blocks.5.ffn.value.weight\r\n", "50277 2048 0.5 head.weight\r\n" ] } ], "source": [ "# Write the initial weights file that the training cell passes as --model.load_model.\n", "# --skip-if-exists is passed; presumably an existing init file is left untouched (confirm in init_model.py)\n", "!cd \"{TRAINER_DIR}\" && python3 ./init_model.py \\\n", "    --n_layer {LAYER_COUNT} \\\n", "    --n_embd {EMBED_SIZE} \\\n", "    --emb-scale \"{EMBED_SCALE}\" \\\n", "    --vocab_size neox \\\n", "    --skip-if-exists \\\n", "    \"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\"" ] }, { "cell_type": "markdown", "id": "8afd9e50", "metadata": { "papermill": { "duration": 0.005752, "end_time": "2023-09-29T05:06:56.006385", "exception": false, "start_time": "2023-09-29T05:06:56.000633", "status": "completed" }, "tags": [] }, "source": [ "## Enwiki Stage 1 : Foundation 4k model training" ] }, { "cell_type": "code", "execution_count": 4, "id": "ff78d2bd", "metadata": { "execution": { "iopub.execute_input": 
"2023-09-29T05:06:56.020959Z", "iopub.status.busy": "2023-09-29T05:06:56.020447Z", "iopub.status.idle": "2023-09-29T05:07:01.579575Z", "shell.execute_reply": "2023-09-29T05:07:01.578476Z" }, "papermill": { "duration": 5.569483, "end_time": "2023-09-29T05:07:01.582319", "exception": false, "start_time": "2023-09-29T05:06:56.012836", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Traceback (most recent call last):\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/preload_datapath.py\", line 20, in \r\n", " assert os.path.exists(config_file), \"Config file does not exist\"\r\n", "AssertionError: Config file does not exist\r\n" ] } ], "source": [ "# Lets preload the requried dataset \n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 preload_datapath.py \"{NOTEBOOK_DIR}/v5base-enwiki-4k-part1.yaml\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "f656d56b", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T05:07:01.598719Z", "iopub.status.busy": "2023-09-29T05:07:01.597947Z", "iopub.status.idle": "2023-09-29T05:07:01.851778Z", "shell.execute_reply": "2023-09-29T05:07:01.850738Z" }, "papermill": { "duration": 0.265316, "end_time": "2023-09-29T05:07:01.854564", "exception": false, "start_time": "2023-09-29T05:07:01.589248", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/usr/bin/sh: 1: python: not found\r\n" ] } ], "source": [ "# Start the foundation model training\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", " python lightning_trainer.py fit \\\n", " -c \"{NOTEBOOK_DIR}/v5base-enwiki-4k-part1.yaml\" \\\n", " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", " --trainer.devices=\"{GPU_DEVICES}\" \\\n", " 
--trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/\" \\\n", " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\" \\\n", " --model.ctx_len=4096 \\\n", " --model.bptt_learning_range=1" ] }, { "cell_type": "code", "execution_count": 6, "id": "c7b46f94", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T05:07:01.871225Z", "iopub.status.busy": "2023-09-29T05:07:01.870345Z", "iopub.status.idle": "2023-09-29T05:07:02.373808Z", "shell.execute_reply": "2023-09-29T05:07:02.372753Z" }, "papermill": { "duration": 0.51526, "end_time": "2023-09-29T05:07:02.376685", "exception": false, "start_time": "2023-09-29T05:07:01.861425", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/usr/bin/sh: 1: python: not found\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ls: cannot access '../model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth': No such file or directory\r\n" ] } ], "source": [ "# Let's export the model from the last checkpoint, then list the exported file.\n", "# Invoked as python3: the runner has no bare `python` on PATH\n", "# (the recorded run failed with \"python: not found\")\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n", "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "9f558c57", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T05:07:02.393471Z", "iopub.status.busy": "2023-09-29T05:07:02.392695Z", "iopub.status.idle": "2023-09-29T05:07:08.804315Z", "shell.execute_reply": "2023-09-29T05:07:08.803244Z" }, "papermill": { "duration": 6.42299, "end_time": "2023-09-29T05:07:08.806769", "exception": false, "start_time": "2023-09-29T05:07:02.383779", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 05:07:06,749] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda 
(auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "Traceback (most recent call last):\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in \r\n", " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n", " self.model = RWKV(**model_config)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n", " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n", "ValueError: load_model file '../model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth' does not exist\r\n" ] } ], "source": [ "# Let's do a quick dragon prompt validation\n", "!cd \"{INFERENCE_DIR}\" && \\\n", " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\"" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "papermill": { "default_parameters": {}, "duration": 44.644446, "end_time": "2023-09-29T05:07:09.133994", "environment_variables": {}, "exception": null, "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb", "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb", "parameters": {}, "start_time": "2023-09-29T05:06:24.489548", "version": "2.4.0" } }, "nbformat": 4, "nbformat_minor": 5 }