diff --git "a/experiment/rwkv-x-exp/v5-headsize32/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb" "b/experiment/rwkv-x-exp/v5-headsize32/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb" new file mode 100644--- /dev/null +++ "b/experiment/rwkv-x-exp/v5-headsize32/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb" @@ -0,0 +1,115942 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "09e44335", + "metadata": { + "papermill": { + "duration": 0.003978, + "end_time": "2023-08-26T07:28:27.299314", + "exception": false, + "start_time": "2023-08-26T07:28:27.295336", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# RWKV v5-headsize32 / embedding init-range 1e-01 / 4k\n", + "\n", + "- 6 layers\n", + "- 2048 embedding size\n", + "\n", + "Runs the modified memory training for v5 models, across various initial embedding weight scales\n", + "\n", + "**Note:** This project assumes you have the rwkv-infctx conda env set up" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1f1248e4", + "metadata": { + "papermill": { + "duration": 0.002314, + "end_time": "2023-08-26T07:28:27.304102", + "exception": false, + "start_time": "2023-08-26T07:28:27.301788", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Basic Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "767d263a", + "metadata": { + "execution": { + "iopub.execute_input": "2023-08-26T07:28:27.310112Z", + "iopub.status.busy": "2023-08-26T07:28:27.309934Z", + "iopub.status.idle": "2023-08-26T07:28:28.033284Z", + "shell.execute_reply": "2023-08-26T07:28:28.031917Z" + }, + "papermill": { + "duration": 0.72876, + "end_time": "2023-08-26T07:28:28.035241", + "exception": false, + "start_time": "2023-08-26T07:28:27.306481", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# First lets setup the various directories, and init the model\n", + "!mkdir -p ../../../../model/\n", + "!mkdir -p ../../../../datapath/\n", + "!mkdir -p 
../../../../checkpoint/" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4997aff1", + "metadata": { + "execution": { + "iopub.execute_input": "2023-08-26T07:28:28.041248Z", + "iopub.status.busy": "2023-08-26T07:28:28.041054Z", + "iopub.status.idle": "2023-08-26T07:28:30.913840Z", + "shell.execute_reply": "2023-08-26T07:28:30.912852Z" + }, + "papermill": { + "duration": 2.877909, + "end_time": "2023-08-26T07:28:30.915604", + "exception": false, + "start_time": "2023-08-26T07:28:28.037695", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\r\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\r\n" + ] + } + ], + "source": [ + "# Additional dependencies for eval stuff\n", + "!pip install -q aiocsv aiofiles" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b2aa92e2", + "metadata": { + "execution": { + "iopub.execute_input": "2023-08-26T07:28:30.922789Z", + "iopub.status.busy": "2023-08-26T07:28:30.922578Z", + "iopub.status.idle": "2023-08-26T07:28:30.929443Z", + "shell.execute_reply": "2023-08-26T07:28:30.928523Z" + }, + "papermill": { + "duration": 0.012074, + "end_time": "2023-08-26T07:28:30.930888", + "exception": false, + "start_time": "2023-08-26T07:28:30.918814", 
+ "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DEEPSPEED_STRAT: deepspeed_stage_1\n", + "ENABLE_WANDB: True\n", + "GPU_DEVICES: auto\n", + "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize32\n", + "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize32\n", + "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize32\n", + "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "\n", + "RWKV_WAVENET_LAYERS=1\n", + "\n", + "EMBED_SCALE=0.1\n", + "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", + "\n", + "LAYER_COUNT=6\n", + "EMBED_DIM=2048\n", + "\n", + "WANDB_PREFIX=f\"v5-hs32-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5-hs32-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", + "\n", + "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize32/\"))\n", + "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize32/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 4, + "id": "b26927c0", + "metadata": { + "execution": { + "iopub.execute_input": "2023-08-26T07:28:30.937736Z", + "iopub.status.busy": "2023-08-26T07:28:30.937427Z", + "iopub.status.idle": "2023-08-26T07:29:01.920340Z", + "shell.execute_reply": "2023-08-26T07:29:01.919447Z" + }, + "papermill": { + "duration": 30.989011, + "end_time": "2023-08-26T07:29:01.922511", + "exception": false, + "start_time": "2023-08-26T07:28:30.933500", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", + "---- Initializing model ----\r\n", + "No of layers: 6\r\n", + "Embedding size: 2048\r\n", + "Output model path: ../model/v5-hs32-L6-D2048-E0_1-neox-init.pth\r\n", + "Vocab size: 50277\r\n", + "Emb scale: 0.1\r\n", + "Note: this process takes a significant time (and ram) for large models\r\n", + "---- ----- ----\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "50277 2048 -0.1 emb.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.0.att.receptance.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.0.att.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.0.att.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.0.att.output.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8192 2048 1.0 blocks.0.ffn.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.0.ffn.receptance.weight\r\n", + "2048 8192 0 blocks.0.ffn.value.weight\r\n" + ] + }, + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.1.att.receptance.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.1.att.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.1.att.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.1.att.output.weight\r\n", + "8192 2048 1.0 blocks.1.ffn.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.1.ffn.receptance.weight\r\n", + "2048 8192 0 blocks.1.ffn.value.weight\r\n", + "2048 2048 1.0 blocks.2.att.receptance.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.2.att.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.2.att.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.2.att.output.weight\r\n", + "8192 2048 1.0 blocks.2.ffn.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.2.ffn.receptance.weight\r\n", + "2048 8192 0 blocks.2.ffn.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.3.att.receptance.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.3.att.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.3.att.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.3.att.output.weight\r\n", + "8192 2048 1.0 blocks.3.ffn.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.3.ffn.receptance.weight\r\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "2048 8192 0 blocks.3.ffn.value.weight\r\n", + "2048 2048 1.0 blocks.4.att.receptance.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.4.att.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.4.att.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.4.att.output.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8192 2048 1.0 blocks.4.ffn.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.4.ffn.receptance.weight\r\n", + "2048 8192 0 blocks.4.ffn.value.weight\r\n", + "2048 2048 1.0 blocks.5.att.receptance.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.5.att.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.5.att.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.5.att.output.weight\r\n", + "8192 2048 1.0 blocks.5.ffn.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.5.ffn.receptance.weight\r\n", + "2048 8192 0 blocks.5.ffn.value.weight\r\n", + "50277 2048 0.5 head.weight\r\n" + ] + } + ], + "source": [ + "# Init the model\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ./init_model.py \\\n", + " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", + " --emb-scale \"{EMBED_SCALE}\" \\\n", + " --vocab_size neox --skip-if-exists \\\n", + " \"../model/{FILENAME_PREFIX}-neox-init.pth\"" + ] + }, + { + "cell_type": "markdown", + "id": "87ade375", + "metadata": { + "papermill": { + "duration": 0.004207, + "end_time": "2023-08-26T07:29:01.931843", + 
"exception": false, + "start_time": "2023-08-26T07:29:01.927636", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Enwiki Stage 1 : Foundation 4k model training" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "09c3e364", + "metadata": { + "execution": { + "iopub.execute_input": "2023-08-26T07:29:01.943505Z", + "iopub.status.busy": "2023-08-26T07:29:01.942714Z", + "iopub.status.idle": "2023-08-26T07:29:12.926373Z", + "shell.execute_reply": "2023-08-26T07:29:12.925399Z" + }, + "papermill": { + "duration": 10.992521, + "end_time": "2023-08-26T07:29:12.928595", + "exception": false, + "start_time": "2023-08-26T07:29:01.936074", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found cached dataset parquet (/actions-runner/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\r\n", + "\r", + " 0%| | 0/1 [00:00=12.1), as this is known to have freeze issues\r\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", + "#\r\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\r\n", + "\r\n", + "[RWKV.model] Configuring optimizer with\r\n", + " - lr_init: 6.000e-04 (0.0006)\r\n", + " - lr_final: 4.000e-04 (0.0004)\r\n", + "\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as 
PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Detected CUDA files, patching ldflags\r\n", + "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Building extension module fused_adam...\r\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", + "ninja: no work to do.\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.07028579711914062 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.10191679000854492 seconds\r\n", + "Time to load fused_adam op: 0.10213589668273926 seconds\r\n", + "Time to load fused_adam op: 0.10210657119750977 seconds\r\n", + "Time to load fused_adam op: 0.10168051719665527 seconds\r\n", + "Time to load fused_adam op: 0.10149025917053223 seconds\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.1018669605255127 seconds\r\n", + "Loading `train_dataloader` to estimate number of stepping batches.\r\n", + "Time to load fused_adam op: 0.10195755958557129 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions 
root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", + "Building extension module utils...\r\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", + "ninja: no work to do.\r\n", + "Loading extension module utils...\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to load utils op: 0.06916999816894531 seconds\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10274457931518555 seconds\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10211014747619629 seconds\r\n", + "Time to load utils op: 0.10240483283996582 seconds\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10251259803771973 seconds\r\n", + "Time to load utils op: 0.1026301383972168 seconds\r\n", + "Time to load utils op: 0.10254979133605957 seconds\r\n", + "Time to load utils op: 0.10275721549987793 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 6 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 4 partition count [8, 8] and sizes[(66654208, False), 
(96, False)] \r\n", + "Rank: 7 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 0 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 1 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 3 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 5 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 2 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0006668567657470703 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0014417171478271484 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0007450580596923828 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Using 
/root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Time to load utils op: 0.0005919933319091797 seconds\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Time to load utils op: 0.0006432533264160156 seconds\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0006520748138427734 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.000812530517578125 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0009579658508300781 seconds\r\n", + "\r\n", + " | Name | Type | Params\r\n", + "--------------------------------------\r\n", + "0 | emb | Embedding | 102 M \r\n", + "1 | blocks | ModuleList | 327 M \r\n", + "2 | ln_out | LayerNorm | 4.1 K \r\n", + "3 | head | Linear | 102 M \r\n", + "--------------------------------------\r\n", + "533 M Trainable params\r\n", + "0 Non-trainable params\r\n", + "533 M Total params\r\n", + "2,132.938 Total estimated model params size (MB)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Training: 0it [00:00, ?it/s]\r", + "Training: 0%| | 0/10186 [00:00\r\n", + " asyncio.run(main_function())\r\n", + " File \"/usr/lib/python3.11/asyncio/runners.py\", line 190, in run\r\n", + " 
return runner.run(main)\r\n", + " ^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/lib/python3.11/asyncio/runners.py\", line 118, in run\r\n", + " return self._loop.run_until_complete(task)\r\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/lib/python3.11/asyncio/base_events.py\", line 653, in run_until_complete\r\n", + " return future.result()\r\n", + " ^^^^^^^^^^^^^^^\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize32/../memory_script/eval_v5_memory_guided.py\", line 58, in main_function\r\n", + " model = SimpleRWKV(model_path, device=\"cuda\")\r\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1378, in __init__\r\n", + " self.model = RWKV(**model_config)\r\n", + " ^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 650, in __init__\r\n", + " self.load_state_dict(model_weights)\r\n", + " File \"/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\", line 2041, in load_state_dict\r\n", + " raise RuntimeError('Error(s) in loading state_dict for {}:\\n\\t{}'.format(\r\n", + "RuntimeError: Error(s) in loading state_dict for RWKV:\r\n", + "\tsize mismatch for blocks.0.att.time_decay: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.0.att.time_faaaa: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.1.att.time_decay: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.1.att.time_faaaa: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch 
for blocks.2.att.time_decay: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.2.att.time_faaaa: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.3.att.time_decay: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.3.att.time_faaaa: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.4.att.time_decay: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.4.att.time_faaaa: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.5.att.time_decay: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.5.att.time_faaaa: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([32]).\r\n" + ] + } + ], + "source": [ + "# Lets do a quick memory test\n", + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a2881385", + "metadata": { + "papermill": { + "duration": 0.68617, + "end_time": "2023-08-26T10:19:24.118277", + "exception": false, + "start_time": "2023-08-26T10:19:23.432107", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Enwiki Stage 2 : Basic Instruct Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "67f2f796", + "metadata": { + "execution": { + 
"iopub.execute_input": "2023-08-26T10:19:25.254199Z", + "iopub.status.busy": "2023-08-26T10:19:25.253352Z", + "iopub.status.idle": "2023-08-26T10:19:32.769820Z", + "shell.execute_reply": "2023-08-26T10:19:32.769190Z" + }, + "papermill": { + "duration": 8.088274, + "end_time": "2023-08-26T10:19:32.771899", + "exception": false, + "start_time": "2023-08-26T10:19:24.683625", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found cached dataset parquet (/actions-runner/.cache/huggingface/datasets/c-s-ale___parquet/c-s-ale--dolly-15k-instruction-alpaca-format-9dfbb23260d63d9d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\r\n", + "\r", + " 0%| | 0/1 [00:00=12.1), as this is known to have freeze issues\r\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", + "#LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\r\n", + "\r\n", + "\r\n", + "[RWKV.model] Configuring optimizer with\r\n", + " - lr_init: 4.000e-04 (0.0004)\r\n", + " - lr_final: 3.000e-04 (0.0003)\r\n", + "\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "Detected CUDA files, patching ldflags\r\n", + "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", + "Building extension module fused_adam...\r\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", + "ninja: no work to do.\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.07085037231445312 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.10151219367980957 seconds\r\n", + "Time to load fused_adam op: 0.1011807918548584 seconds\r\n", + "Time to load fused_adam op: 0.10185527801513672 seconds\r\n", + "Loading `train_dataloader` to estimate number of stepping batches.\r\n", + "Time to load fused_adam op: 0.10194730758666992 seconds\r\n", + "Time to load fused_adam op: 0.10143423080444336 seconds\r\n", + "Time to load fused_adam op: 0.10146594047546387 seconds\r\n", + "Time to load fused_adam op: 0.10157418251037598 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions 
root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", + "Building extension module utils...\r\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", + "ninja: no work to do.\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0713951587677002 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10219931602478027 seconds\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10209369659423828 seconds\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10286974906921387 seconds\r\n", + "Time to load utils op: 0.1024172306060791 seconds\r\n", + "Time to load utils op: 0.10242533683776855 seconds\r\n", + "Time to load utils op: 0.10226178169250488 seconds\r\n", + "Time to load utils op: 0.10172843933105469 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 2 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 0 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 4 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 7 
partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 1 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 6 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 5 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 3 partition count [8, 8] and sizes[(66654208, False), (96, False)] \r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0005695819854736328 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0005481243133544922 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.000606536865234375 seconds\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module 
utils...\r\n", + "Time to load utils op: 0.000843048095703125 seconds\r\n", + "Time to load utils op: 0.0006282329559326172 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0006814002990722656 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0010170936584472656 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0008132457733154297 seconds\r\n", + "\r\n", + " | Name | Type | Params\r\n", + "--------------------------------------\r\n", + "0 | emb | Embedding | 102 M \r\n", + "1 | blocks | ModuleList | 327 M \r\n", + "2 | ln_out | LayerNorm | 4.1 K \r\n", + "3 | head | Linear | 102 M \r\n", + "--------------------------------------\r\n", + "533 M Trainable params\r\n", + "0 Non-trainable params\r\n", + "533 M Total params\r\n", + "2,132.938 Total estimated model params size (MB)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Training: 0it [00:00, ?it/s]\r", + "Training: 0%| | 0/1867 [00:00