diff --git "a/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb" "b/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb" --- "a/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb" +++ "b/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb" @@ -3,13 +3,13 @@ { "attachments": {}, "cell_type": "markdown", - "id": "403604b7", + "id": "b33301f1", "metadata": { "papermill": { - "duration": 0.002912, - "end_time": "2023-09-29T05:56:34.178431", + "duration": 0.002639, + "end_time": "2023-09-29T06:40:35.355878", "exception": false, - "start_time": "2023-09-29T05:56:34.175519", + "start_time": "2023-09-29T06:40:35.353239", "status": "completed" }, "tags": [] @@ -23,13 +23,13 @@ { "attachments": {}, "cell_type": "markdown", - "id": "3a120d89", + "id": "7f0c7442", "metadata": { "papermill": { - "duration": 0.002318, - "end_time": "2023-09-29T05:56:34.184361", + "duration": 0.001897, + "end_time": "2023-09-29T06:40:35.361675", "exception": false, - "start_time": "2023-09-29T05:56:34.182043", + "start_time": "2023-09-29T06:40:35.359778", "status": "completed" }, "tags": [] @@ -41,19 +41,19 @@ { "cell_type": "code", "execution_count": 1, - "id": "8949fa01", + "id": "5697b559", "metadata": { "execution": { - "iopub.execute_input": "2023-09-29T05:56:34.188493Z", - "iopub.status.busy": "2023-09-29T05:56:34.188124Z", - "iopub.status.idle": "2023-09-29T05:56:34.854422Z", - "shell.execute_reply": "2023-09-29T05:56:34.853652Z" + "iopub.execute_input": "2023-09-29T06:40:35.367571Z", + "iopub.status.busy": "2023-09-29T06:40:35.367056Z", + "iopub.status.idle": "2023-09-29T06:40:36.120936Z", + "shell.execute_reply": "2023-09-29T06:40:36.120020Z" }, "papermill": { - "duration": 0.670486, - "end_time": "2023-09-29T05:56:34.856364", + "duration": 0.75911, + "end_time": "2023-09-29T06:40:36.122941", "exception": false, - "start_time": "2023-09-29T05:56:34.185878", + "start_time": "2023-09-29T06:40:35.363831", "status": "completed" }, "tags": [] @@ -69,19 +69,19 @@ { "cell_type": "code", "execution_count": 2, - "id": "dc84d15b", + "id": "a6de4e7e", "metadata": { "execution": { - "iopub.execute_input": "2023-09-29T05:56:34.862583Z", - "iopub.status.busy": "2023-09-29T05:56:34.862335Z", - "iopub.status.idle": "2023-09-29T05:56:34.870376Z", - "shell.execute_reply": "2023-09-29T05:56:34.869641Z" + "iopub.execute_input": "2023-09-29T06:40:36.129456Z", + "iopub.status.busy": "2023-09-29T06:40:36.128952Z", + "iopub.status.idle": "2023-09-29T06:40:36.137224Z", + "shell.execute_reply": "2023-09-29T06:40:36.136398Z" }, "papermill": { - "duration": 0.012677, - "end_time": "2023-09-29T05:56:34.871741", + "duration": 0.013484, + "end_time": "2023-09-29T06:40:36.138880", "exception": false, - "start_time": "2023-09-29T05:56:34.859064", + "start_time": "2023-09-29T06:40:36.125396", "status": "completed" }, "tags": [] @@ -140,19 +140,19 @@ { "cell_type": "code", "execution_count": 3, - "id": "3e8278e4", + "id": "22f457d9", "metadata": { "execution": { - "iopub.execute_input": "2023-09-29T05:56:34.877052Z", - "iopub.status.busy": "2023-09-29T05:56:34.876592Z", - "iopub.status.idle": "2023-09-29T05:57:04.417944Z", - "shell.execute_reply": "2023-09-29T05:57:04.417221Z" + "iopub.execute_input": "2023-09-29T06:40:36.145747Z", + "iopub.status.busy": "2023-09-29T06:40:36.145217Z", + "iopub.status.idle": "2023-09-29T06:41:21.917242Z", + "shell.execute_reply": "2023-09-29T06:41:21.916099Z" }, "papermill": { - "duration": 29.546096, - "end_time": "2023-09-29T05:57:04.419963", + "duration": 45.777858, + "end_time": "2023-09-29T06:41:21.919315", "exception": false, - "start_time": "2023-09-29T05:56:34.873867", + "start_time": "2023-09-29T06:40:36.141457", "status": "completed" }, "tags": [] @@ -162,7 +162,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2023-09-29 05:56:37,806] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + "[2023-09-29 06:40:40,584] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { @@ -263,7 +263,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "2048 2048 0 blocks.1.att.output.weight\r\n", + "2048 2048 0 blocks.1.att.output.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "7168 2048 1.0 blocks.1.ffn.key.weight\r\n" ] }, @@ -322,13 +328,7 @@ "output_type": "stream", "text": [ "2048 2048 0 blocks.2.ffn.receptance.weight\r\n", - "2048 7168 0 blocks.2.ffn.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "2048 7168 0 blocks.2.ffn.value.weight\r\n", "2048 2048 1.0 blocks.3.att.gate.weight\r\n" ] }, @@ -460,7 +460,13 @@ "output_type": "stream", "text": [ "2048 2048 0 blocks.5.ffn.receptance.weight\r\n", - "2048 7168 0 blocks.5.ffn.value.weight\r\n", + "2048 7168 0 blocks.5.ffn.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "2048 2048 1.0 blocks.6.att.gate.weight\r\n" ] }, @@ -542,7 +548,13 @@ "output_type": "stream", "text": [ "2048 2048 0 blocks.7.ffn.receptance.weight\r\n", - "2048 7168 0 blocks.7.ffn.value.weight\r\n", + "2048 7168 0 blocks.7.ffn.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "2048 2048 1.0 blocks.8.att.gate.weight\r\n" ] }, @@ -668,7 +680,13 @@ "output_type": "stream", "text": [ "2048 2048 0 blocks.10.ffn.receptance.weight\r\n", - "2048 7168 0 blocks.10.ffn.value.weight\r\n", + "2048 7168 0 blocks.10.ffn.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "2048 2048 1.0 blocks.11.att.gate.weight\r\n" ] }, @@ -706,7 +724,13 @@ "output_type": "stream", "text": [ "2048 2048 0 blocks.11.ffn.receptance.weight\r\n", - "2048 7168 0 blocks.11.ffn.value.weight\r\n", + "2048 7168 0 blocks.11.ffn.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "50277 2048 0.5 head.weight\r\n" ] } @@ -723,13 +747,13 @@ }, { "cell_type": "markdown", - "id": "09f5efbe", + "id": "fde86502", "metadata": { "papermill": { - "duration": 0.004414, - "end_time": "2023-09-29T05:57:04.432709", + "duration": 0.006163, + "end_time": "2023-09-29T06:41:21.931885", "exception": false, - "start_time": "2023-09-29T05:57:04.428295", + "start_time": "2023-09-29T06:41:21.925722", "status": "completed" }, "tags": [] @@ -741,19 +765,19 @@ { "cell_type": "code", "execution_count": 4, - "id": "0bc8ec73", + "id": "a862f05f", "metadata": { "execution": { - "iopub.execute_input": "2023-09-29T05:57:04.441904Z", - "iopub.status.busy": "2023-09-29T05:57:04.441548Z", - "iopub.status.idle": "2023-09-29T06:02:22.847012Z", - "shell.execute_reply": "2023-09-29T06:02:22.846216Z" + "iopub.execute_input": "2023-09-29T06:41:21.946957Z", + "iopub.status.busy": "2023-09-29T06:41:21.946097Z", + "iopub.status.idle": "2023-09-29T06:41:33.126385Z", + "shell.execute_reply": "2023-09-29T06:41:33.125478Z" }, "papermill": { - "duration": 318.412248, - "end_time": "2023-09-29T06:02:22.848898", + "duration": 11.190379, + "end_time": "2023-09-29T06:41:33.128643", "exception": false, - "start_time": "2023-09-29T05:57:04.436650", + "start_time": "2023-09-29T06:41:21.938264", "status": "completed" }, "tags": [] @@ -764,7 +788,7 @@ "output_type": "stream", "text": [ "\r", - "Map (num_proc=16): 0%| | 0/1000000 [00:00\r\n", + " cli_main()\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n", + " LightningCLI(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 353, in __init__\r\n", + " self._run_subcommand(self.subcommand)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 642, in _run_subcommand\r\n", + " fn(**fn_kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 529, in fit\r\n", + " call._call_and_handle_interrupt(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 41, in _call_and_handle_interrupt\r\n", + " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 91, in launch\r\n", + " return function(*args, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 568, in _fit_impl\r\n", + " self._run(model, ckpt_path=ckpt_path)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 925, in _run\r\n", + " self._data_connector.prepare_data()\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py\", line 94, in prepare_data\r\n", + " call._call_lightning_datamodule_hook(trainer, \"prepare_data\")\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 164, in _call_lightning_datamodule_hook\r\n", + " return fn(*args, **kwargs)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 549, in prepare_data\r\n", + " prepare_data_static(**self._init_locals)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 464, in prepare_data_static\r\n", + " src_dataset[\"train\"] = src_dataset[\"train\"].select(range(offset_val, offset_val + length_val))\r\n", + "TypeError: 'float' object cannot be interpreted as an integer\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L12-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/udijamu6\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v3\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230929_064140-udijamu6/logs\u001b[0m\r\n" ] } ], @@ -15813,7 +16360,7 @@ "# Start the foundation model training\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", - " python lightning_trainer.py fit \\\n", + " python3 lightning_trainer.py fit \\\n", " -c \"{NOTEBOOK_DIR}/enwiki-4k-part1.yaml\" \\\n", " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", @@ -15827,19 +16374,19 @@ { "cell_type": "code", "execution_count": 6, - "id": "57fc24eb", + "id": "9dcc8aa0", "metadata": { "execution": { - "iopub.execute_input": "2023-09-29T06:02:23.372270Z", - "iopub.status.busy": "2023-09-29T06:02:23.371974Z", - "iopub.status.idle": "2023-09-29T06:02:23.812570Z", - "shell.execute_reply": "2023-09-29T06:02:23.811861Z" + "iopub.execute_input": "2023-09-29T06:46:53.049378Z", + "iopub.status.busy": "2023-09-29T06:46:53.048820Z", + "iopub.status.idle": "2023-09-29T06:46:56.789000Z", + "shell.execute_reply": "2023-09-29T06:46:56.787968Z" }, "papermill": { - "duration": 0.515407, - "end_time": "2023-09-29T06:02:23.814292", + "duration": 3.854596, + "end_time": "2023-09-29T06:46:56.791829", "exception": false, - "start_time": "2023-09-29T06:02:23.298885", + "start_time": "2023-09-29T06:46:52.937233", "status": "completed" }, "tags": [] @@ -15849,7 +16396,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "/usr/bin/sh: 1: python: not found\r\n" + "[2023-09-29 06:46:55,390] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in \r\n", + " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n", + " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n", + "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L12-D2048-E0_01-enwiki-4k-p1/last.ckpt/latest\r\n" ] }, { @@ -15863,26 +16430,26 @@ "source": [ "# Lets export the model from the checkpoint\n", "!cd \"{TRAINER_DIR}\" && \\\n", - " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n", + " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n", "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\"" ] }, { "cell_type": "code", "execution_count": 7, - "id": "7cfa04c5", + "id": "fbac8551", "metadata": { "execution": { - "iopub.execute_input": "2023-09-29T06:02:23.964300Z", - "iopub.status.busy": "2023-09-29T06:02:23.963981Z", - "iopub.status.idle": "2023-09-29T06:02:27.911591Z", - "shell.execute_reply": "2023-09-29T06:02:27.910851Z" + "iopub.execute_input": "2023-09-29T06:46:57.029599Z", + "iopub.status.busy": "2023-09-29T06:46:57.029240Z", + "iopub.status.idle": "2023-09-29T06:47:03.411064Z", + "shell.execute_reply": "2023-09-29T06:47:03.409916Z" }, "papermill": { - "duration": 4.022282, - "end_time": "2023-09-29T06:02:27.913551", + "duration": 6.495854, + "end_time": "2023-09-29T06:47:03.413417", "exception": false, - "start_time": "2023-09-29T06:02:23.891269", + "start_time": "2023-09-29T06:46:56.917563", "status": "completed" }, "tags": [] @@ -15892,14 +16459,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2023-09-29 06:02:26,883] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + "[2023-09-29 06:47:01,373] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "Traceback (most recent call last):\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in \r\n", " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n", @@ -15938,14 +16511,14 @@ }, "papermill": { "default_parameters": {}, - "duration": 354.822024, - "end_time": "2023-09-29T06:02:28.105571", + "duration": 389.827856, + "end_time": "2023-09-29T06:47:03.946497", "environment_variables": {}, "exception": null, "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb", "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb", "parameters": {}, - "start_time": "2023-09-29T05:56:33.283547", + "start_time": "2023-09-29T06:40:34.118641", "version": "2.4.0" } },