# RWKV v5-headsize2x / embedding init-range 1e-01 / 4k

- 6 layers
- 4096 embedding size

Going through the modified memory training for v5 models, across various initial embedding model weights

**Note:** This project assumes you have the rwkv-infctx conda env setup

# Basic Setup

In [1]:
# First lets setup the various directories, and init the model
!mkdir -p ../../../../model/
!mkdir -p ../../../../datapath/
!mkdir -p ../../../../checkpoint/

In [2]:
# Additional dependencies for eval stuff
!pip install -q aiocsv aiofiles

[0m


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [3]:
DEEPSPEED_STRAT="deepspeed_stage_1"
GPU_DEVICES="auto"
ENABLE_WANDB=True

RWKV_WAVENET_LAYERS=1

EMBED_SCALE=0.1
EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(".", "_")

LAYER_COUNT=6
EMBED_DIM=2048

WANDB_PREFIX=f"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}"
FILENAME_PREFIX=f"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}"

print("DEEPSPEED_STRAT:", DEEPSPEED_STRAT)
print("ENABLE_WANDB:", ENABLE_WANDB)
print("GPU_DEVICES:", GPU_DEVICES)

if ENABLE_WANDB:
 WANDB_MODE="online"
else:
 WANDB_MODE="disabled"

# Computing the notebook, and various paths
import os
NOTEBOOK_DIR=os.path.dirname(os.path.abspath("__file__"))
PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../../"))
TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5headsize2x/"))
INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5headsize2x/"))

print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("INFERENCE_DIR:", INFERENCE_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

DEEPSPEED_STRAT: deepspeed_stage_1
ENABLE_WANDB: True
GPU_DEVICES: auto
NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x
INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x
TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x
PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer


In [4]:
# Init the model
!cd "{TRAINER_DIR}" && \
 export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 python3 ./init_model.py \
 --n_layer 6 --n_embd 4096 \
 --emb-scale "{EMBED_SCALE}" \
 --vocab_size neox --skip-if-exists \
 "../model/L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth"

Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
---- Initializing model ----
No of layers: 6
Embedding size: 4096
Output model path: ../model/L6-D2048-E0_1-neox-v5base-init.pth
Vocab size: 50277
Emb scale: 0.1
Note: this process takes a significant time (and ram) for large models
---- ----- ----


50277 4096 -0.1 emb.weight


4096 4096 1.0 blocks.0.att.receptance.weight


4096 4096 1.0 blocks.0.att.key.weight


4096 4096 1.0 blocks.0.att.value.weight


4096 4096 0 blocks.0.att.output.weight
16384 4096 1.0 blocks.0.ffn.key.weight


4096 4096 0 blocks.0.ffn.receptance.weight
4096 16384 0 blocks.0.ffn.value.weight


4096 4096 1.0 blocks.1.att.receptance.weight


4096 4096 1.0 blocks.1.att.key.weight


4096 4096 1.0 blocks.1.att.value.weight


4096 4096 0 blocks.1.att.output.weight


16384 4096 1.0 blocks.1.ffn.key.weight


4096 4096 0 blocks.1.ffn.receptance.weight


4096 16384 0 blocks.1.ffn.value.weight


4096 4096 1.0 blocks.2.att.receptance.weight


4096 4096 1.0 blocks.2.att.key.weight


4096 4096 1.0 blocks.2.att.value.weight


4096 4096 0 blocks.2.att.output.weight
16384 4096 1.0 blocks.2.ffn.key.weight


4096 4096 0 blocks.2.ffn.receptance.weight


4096 16384 0 blocks.2.ffn.value.weight


4096 4096 1.0 blocks.3.att.receptance.weight


4096 4096 1.0 blocks.3.att.key.weight


4096 4096 1.0 blocks.3.att.value.weight


4096 4096 0 blocks.3.att.output.weight
16384 4096 1.0 blocks.3.ffn.key.weight


4096 4096 0 blocks.3.ffn.receptance.weight


4096 16384 0 blocks.3.ffn.value.weight


4096 4096 1.0 blocks.4.att.receptance.weight


4096 4096 1.0 blocks.4.att.key.weight


4096 4096 1.0 blocks.4.att.value.weight


## Enwiki Stage 1 : Foundation 4k model training

In [None]:
# Lets preload the requried dataset 
!cd "{TRAINER_DIR}" && \
 python3 preload_datapath.py "{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml"

In [None]:
# Start the foundation model training
!cd "{TRAINER_DIR}" && \
 export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 export WANDB_MODE="{WANDB_MODE}" && \
 python lightning_trainer.py fit \
 -c "{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml" \
 --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-4k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})" \
 --trainer.strategy="{DEEPSPEED_STRAT}" \
 --trainer.devices="{GPU_DEVICES}" \
 --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-enwiki-4k/" \
 --model.load_model="../model/L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth" \
 --model.ctx_len=4096 \
 --model.bptt_learning_range=1

In [None]:
# Lets export the model from the checkpoint
!cd "{TRAINER_DIR}" && \
 python export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-enwiki-4k/last.ckpt" "../model/{FILENAME_PREFIX}-enwiki-4k.pth" "bf16"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-enwiki-4k.pth"

In [None]:
# # Lets do a quick dragon prompt validation
!cd "{INFERENCE_DIR}" && \
 export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 python3 dragon_test.py "../model/{FILENAME_PREFIX}-enwiki-4k.pth" "cuda fp32"

In [None]:
# Lets do a quick memory test
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 python3 ../memory_script/eval_v5_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth"

# Enwiki Stage 2 : Basic Instruct Tuning

In [None]:
# Lets preload the requried dataset
!cd "{TRAINER_DIR}" && \
 python3 preload_datapath.py "{NOTEBOOK_DIR}/v5base-enwiki-instruct.yaml"

In [None]:
# Start the instruct finetuning
!cd "{TRAINER_DIR}" && \
 export WANDB_MODE="{WANDB_MODE}" && \
 export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 python lightning_trainer.py fit \
 -c "{NOTEBOOK_DIR}/v5base-enwiki-instruct.yaml" \
 --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-Instruct (train-ctx=4k, {DEEPSPEED_STRAT})" \
 --trainer.strategy="{DEEPSPEED_STRAT}" \
 --trainer.devices="{GPU_DEVICES}" \
 --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/" \
 --model.load_model="../model/{FILENAME_PREFIX}-enwiki-4k.pth" \
 --model.ctx_len=4096 \
 --model.bptt_learning_range=1

In [None]:
# Lets export the model from the checkpoint
!cd "{TRAINER_DIR}" && \
 python export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/last.ckpt" "../model/{FILENAME_PREFIX}-enwiki-instruct.pth" "bf16"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-enwiki-instruct.pth"

In [None]:
# # Lets do a quick dragon prompt validation
!cd "{INFERENCE_DIR}" && \
 export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 python3 dragon_test.py "../model/{FILENAME_PREFIX}-enwiki-instruct.pth" "cuda fp32"

In [None]:
# Lets do a quick memory test
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 python3 ../memory_script/eval_v5headsize2x_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-instruct.pth"