{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "4c73afb6", "metadata": { "papermill": { "duration": 0.003926, "end_time": "2023-08-25T16:07:12.117562", "exception": false, "start_time": "2023-08-25T16:07:12.113636", "status": "completed" }, "tags": [] }, "source": [ "# RWKV v5-headsize2x / embedding init-range 1e-01 / 4k\n", "\n", "- 6 layers\n", "- 4096 embedding size\n", "\n", "Going through the modified memory training for v5 models, across various initial embedding model weights\n", "\n", "**Note:** This project assumes you have the rwkv-infctx conda env set up" ] }, { "attachments": {}, "cell_type": "markdown", "id": "393e4299", "metadata": { "papermill": { "duration": 0.002494, "end_time": "2023-08-25T16:07:12.122625", "exception": false, "start_time": "2023-08-25T16:07:12.120131", "status": "completed" }, "tags": [] }, "source": [ "# Basic Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "e8229f09", "metadata": { "execution": { "iopub.execute_input": "2023-08-25T16:07:12.129239Z", "iopub.status.busy": "2023-08-25T16:07:12.128559Z", "iopub.status.idle": "2023-08-25T16:07:12.847564Z", "shell.execute_reply": "2023-08-25T16:07:12.846564Z" }, "papermill": { "duration": 0.724082, "end_time": "2023-08-25T16:07:12.849402", "exception": false, "start_time": "2023-08-25T16:07:12.125320", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# First lets setup the various directories, and init the model\n", "!mkdir -p ../../../../model/\n", "!mkdir -p ../../../../datapath/\n", "!mkdir -p ../../../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "c7a42dc0", "metadata": { "execution": { "iopub.execute_input": "2023-08-25T16:07:12.855853Z", "iopub.status.busy": "2023-08-25T16:07:12.855653Z", "iopub.status.idle": "2023-08-25T16:07:15.711591Z", "shell.execute_reply": "2023-08-25T16:07:15.710680Z" }, "papermill": { "duration": 2.861479, "end_time": "2023-08-25T16:07:15.713370", "exception": false, 
"start_time": "2023-08-25T16:07:12.851891", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Additional dependencies for eval stuff\n", "!pip install -q aiocsv aiofiles" ] }, }, "papermill": { "duration": 0.011194, "end_time": "2023-08-25T16:07:15.727505", "exception": false, "start_time": "2023-08-25T16:07:15.716311", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEEPSPEED_STRAT: deepspeed_stage_1\n", "ENABLE_WANDB: True\n", "GPU_DEVICES: auto\n", "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x\n", "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x\n", "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x\n", "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" ] } ], "source": [ "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", "GPU_DEVICES=\"auto\"\n", "ENABLE_WANDB=True\n", "\n", "RWKV_WAVENET_LAYERS=1\n", "\n", "EMBED_SCALE=0.1\n", "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", "\n", "# Model shape: single source of truth for init_model.py and for the L*/D* labels\n", "# embedded in every model / checkpoint / W&B run name built from the prefixes below\n", "LAYER_COUNT=6\n", "EMBED_DIM=4096\n", "\n", "WANDB_PREFIX=f\"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", "FILENAME_PREFIX=f\"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", "\n", "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", "\n", "if ENABLE_WANDB:\n", " WANDB_MODE=\"online\"\n", "else:\n", " WANDB_MODE=\"disabled\"\n", "\n", "# Computing the notebook, and various paths\n", "# NOTE: \"__file__\" below is a plain string literal (not the dunder variable), so\n", "# abspath() resolves it against the current working directory; NOTEBOOK_DIR is\n", "# therefore the kernel's cwd, which is assumed to be this notebook's folder\n", "import os\n", "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize2x/\"))\n", "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, 
\"./RWKV-v5headsize2x/\"))\n", "\n", "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", "print(\"PROJECT_DIR:\", PROJECT_DIR)" ] }, { "cell_type": "code", "execution_count": 4, "id": "fed724db", "metadata": { "execution": { "iopub.execute_input": "2023-08-25T16:07:15.733547Z", "iopub.status.busy": "2023-08-25T16:07:15.733381Z" }, "papermill": { "duration": null, "end_time": null, "exception": false, "start_time": "2023-08-25T16:07:15.730189", "status": "running" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "---- Initializing model ----\r\n", "No of layers: 6\r\n", "Embedding size: 4096\r\n", "Output model path: ../model/L6-D2048-E0_1-neox-v5base-init.pth\r\n", "Vocab size: 50277\r\n", "Emb scale: 0.1\r\n", "Note: this process takes a significant time (and ram) for large models\r\n", "---- ----- ----\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 4096 -0.1 emb.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.0.att.output.weight\r\n", "16384 4096 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.0.ffn.receptance.weight\r\n", "4096 16384 0 blocks.0.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.receptance.weight\r\n" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.1.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "16384 4096 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.1.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.1.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.2.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.2.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.2.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.2.att.output.weight\r\n", "16384 4096 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.2.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.2.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.3.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.3.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.3.att.output.weight\r\n", "16384 4096 1.0 blocks.3.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.3.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.3.ffn.value.weight\r\n" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.value.weight\r\n" ] } ], "source": [ "# Init the model, sized from LAYER_COUNT / EMBED_DIM so the initialised weights\n", "# always match the L*/D* label used in the output filename\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ./init_model.py \\\n", " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", " --emb-scale \"{EMBED_SCALE}\" \\\n", " --vocab_size neox --skip-if-exists \\\n", " \"../model/L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth\"" ] }, { "cell_type": "markdown", "id": "b17f7961", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "source": [ "## Enwiki Stage 1 : Foundation 4k model training" ] }, { "cell_type": "code", "execution_count": null, "id": "0caf9040", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "outputs": [], "source": [ "# Lets preload the required dataset\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 preload_datapath.py \"{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml\"" ] }, { "cell_type": "code", "execution_count": null, "id": "77604691", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "outputs": [], "source": [ "# Start the foundation model training\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", " python lightning_trainer.py fit \\\n", " -c \"{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml\" \\\n", " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", " 
--trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", " --trainer.devices=\"{GPU_DEVICES}\" \\\n", " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/\" \\\n", " --model.load_model=\"../model/L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth\" \\\n", " --model.ctx_len=4096 \\\n", " --model.bptt_learning_range=1" ] }, { "cell_type": "code", "execution_count": null, "id": "41df74c7", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "outputs": [], "source": [ "# Lets export the model from the checkpoint\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"bf16\"\n", "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\"" ] }, { "cell_type": "code", "execution_count": null, "id": "29c9df8a", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "outputs": [], "source": [ "# Lets do a quick dragon prompt validation\n", "!cd \"{INFERENCE_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"cuda fp32\"" ] }, { "cell_type": "code", "execution_count": null, "id": "442c0b35", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "outputs": [], "source": [ "# Lets do a quick memory test\n", "# NOTE: uses the headsize2x evaluator (the same script the Stage 2 memory test\n", "# calls), since this model comes from the RWKV-v5headsize2x trainer\n", "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ../memory_script/eval_v5headsize2x_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth\"" ] }, { "attachments": {}, "cell_type": "markdown", "id": "43fe425a", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": 
null, "status": "pending" }, "tags": [] }, "source": [ "## Enwiki Stage 2 : Basic Instruct Tuning" ] }, { "cell_type": "code", "execution_count": null, "id": "ad7fabe0", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "outputs": [], "source": [ "# Lets preload the required dataset\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 preload_datapath.py \"{NOTEBOOK_DIR}/v5base-enwiki-instruct.yaml\"" ] }, { "cell_type": "code", "execution_count": null, "id": "a219c41a", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "outputs": [], "source": [ "# Start the instruct finetuning\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python lightning_trainer.py fit \\\n", " -c \"{NOTEBOOK_DIR}/v5base-enwiki-instruct.yaml\" \\\n", " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-Instruct (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", " --trainer.devices=\"{GPU_DEVICES}\" \\\n", " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/\" \\\n", " --model.load_model=\"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \\\n", " --model.ctx_len=4096 \\\n", " --model.bptt_learning_range=1" ] }, { "cell_type": "code", "execution_count": null, "id": "0142da06", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "outputs": [], "source": [ "# Lets export the model from the checkpoint\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"bf16\"\n", "!cd \"{TRAINER_DIR}\" && ls -alh 
\"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\"" ] }, { "cell_type": "code", "execution_count": null, "id": "b6c4cda2", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "outputs": [], "source": [ "# # Lets do a quick dragon prompt validation\n", "!cd \"{INFERENCE_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"cuda fp32\"" ] }, { "cell_type": "code", "execution_count": null, "id": "7f278017", "metadata": { "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "outputs": [], "source": [ "# Lets do a quick memory test\n", "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ../memory_script/eval_v5headsize2x_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-instruct.pth\"" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, 