[GHA] experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb result notebook & reports

#165
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb ADDED
@@ -0,0 +1,685 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "id": "f5ebb021",
7
+ "metadata": {
8
+ "papermill": {
9
+ "duration": 0.002819,
10
+ "end_time": "2023-09-29T04:50:11.669839",
11
+ "exception": false,
12
+ "start_time": "2023-09-29T04:50:11.667020",
13
+ "status": "completed"
14
+ },
15
+ "tags": []
16
+ },
17
+ "source": [
18
+ "# RWKV v5 multi-size training experiment\n",
19
+ "\n",
20
+ "**Note:** This project assumes you have the rwkv-infctx conda env set up"
21
+ ]
22
+ },
23
+ {
24
+ "attachments": {},
25
+ "cell_type": "markdown",
26
+ "id": "6e6abc3f",
27
+ "metadata": {
28
+ "papermill": {
29
+ "duration": 0.00214,
30
+ "end_time": "2023-09-29T04:50:11.676239",
31
+ "exception": false,
32
+ "start_time": "2023-09-29T04:50:11.674099",
33
+ "status": "completed"
34
+ },
35
+ "tags": []
36
+ },
37
+ "source": [
38
+ "# Basic Setup"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 1,
44
+ "id": "3d2405bd",
45
+ "metadata": {
46
+ "execution": {
47
+ "iopub.execute_input": "2023-09-29T04:50:11.682830Z",
48
+ "iopub.status.busy": "2023-09-29T04:50:11.682139Z",
49
+ "iopub.status.idle": "2023-09-29T04:50:12.432460Z",
50
+ "shell.execute_reply": "2023-09-29T04:50:12.431486Z"
51
+ },
52
+ "papermill": {
53
+ "duration": 0.756299,
54
+ "end_time": "2023-09-29T04:50:12.434815",
55
+ "exception": false,
56
+ "start_time": "2023-09-29T04:50:11.678516",
57
+ "status": "completed"
58
+ },
59
+ "tags": []
60
+ },
61
+ "outputs": [],
62
+ "source": [
63
+ "# First, let's set up the various directories and init the model\n",
64
+ "!mkdir -p ../../../../model/\n",
65
+ "!mkdir -p ../../../../datapath/\n",
66
+ "!mkdir -p ../../../../checkpoint/"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 2,
72
+ "id": "66fd5201",
73
+ "metadata": {
74
+ "execution": {
75
+ "iopub.execute_input": "2023-09-29T04:50:12.446546Z",
76
+ "iopub.status.busy": "2023-09-29T04:50:12.446098Z",
77
+ "iopub.status.idle": "2023-09-29T04:50:12.454394Z",
78
+ "shell.execute_reply": "2023-09-29T04:50:12.453644Z"
79
+ },
80
+ "papermill": {
81
+ "duration": 0.018125,
82
+ "end_time": "2023-09-29T04:50:12.456177",
83
+ "exception": false,
84
+ "start_time": "2023-09-29T04:50:12.438052",
85
+ "status": "completed"
86
+ },
87
+ "tags": []
88
+ },
89
+ "outputs": [
90
+ {
91
+ "name": "stdout",
92
+ "output_type": "stream",
93
+ "text": [
94
+ "DEEPSPEED_STRAT: deepspeed_stage_1\n",
95
+ "ENABLE_WANDB: True\n",
96
+ "GPU_DEVICES: auto\n",
97
+ "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n",
98
+ "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
99
+ "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
100
+ "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n"
101
+ ]
102
+ }
103
+ ],
104
+ "source": [
105
+ "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n",
106
+ "GPU_DEVICES=\"auto\"\n",
107
+ "ENABLE_WANDB=True\n",
108
+ "\n",
109
+ "EMBED_SCALE=0.01\n",
110
+ "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n",
111
+ "\n",
112
+ "LAYER_COUNT=6\n",
113
+ "EMBED_SIZE=2048\n",
114
+ "\n",
115
+ "WANDB_PREFIX=f\"[Multi-size] v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n",
116
+ "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n",
117
+ "\n",
118
+ "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n",
119
+ "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n",
120
+ "print(\"GPU_DEVICES:\", GPU_DEVICES)\n",
121
+ "\n",
122
+ "if ENABLE_WANDB:\n",
123
+ " WANDB_MODE=\"online\"\n",
124
+ "else:\n",
125
+ " WANDB_MODE=\"disabled\"\n",
126
+ "\n",
127
+ "# Compute the notebook directory and the various paths derived from it\n",
128
+ "import os\n",
129
+ "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n",
130
+ "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n",
131
+ "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
132
+ "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
133
+ "\n",
134
+ "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n",
135
+ "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n",
136
+ "print(\"TRAINER_DIR:\", TRAINER_DIR)\n",
137
+ "print(\"PROJECT_DIR:\", PROJECT_DIR)"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 3,
143
+ "id": "e0b56789",
144
+ "metadata": {
145
+ "execution": {
146
+ "iopub.execute_input": "2023-09-29T04:50:12.464627Z",
147
+ "iopub.status.busy": "2023-09-29T04:50:12.464037Z",
148
+ "iopub.status.idle": "2023-09-29T04:50:42.488005Z",
149
+ "shell.execute_reply": "2023-09-29T04:50:42.486665Z"
150
+ },
151
+ "papermill": {
152
+ "duration": 30.031629,
153
+ "end_time": "2023-09-29T04:50:42.490859",
154
+ "exception": false,
155
+ "start_time": "2023-09-29T04:50:12.459230",
156
+ "status": "completed"
157
+ },
158
+ "tags": []
159
+ },
160
+ "outputs": [
161
+ {
162
+ "name": "stdout",
163
+ "output_type": "stream",
164
+ "text": [
165
+ "[2023-09-29 04:50:16,856] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
166
+ ]
167
+ },
168
+ {
169
+ "name": "stdout",
170
+ "output_type": "stream",
171
+ "text": [
172
+ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
173
+ "---- Initializing model ----\r\n",
174
+ "No of layers: 6\r\n",
175
+ "Embedding size: 2048\r\n",
176
+ "Output model path: ../model/v5-L6-D2048-E0_01-neox-v5base-init.pth\r\n",
177
+ "Vocab size: 50277\r\n",
178
+ "Emb scale: 0.01\r\n",
179
+ "Note: this process takes a significant time (and ram) for large models\r\n",
180
+ "---- ----- ----\r\n"
181
+ ]
182
+ },
183
+ {
184
+ "name": "stdout",
185
+ "output_type": "stream",
186
+ "text": [
187
+ "50277 2048 -0.01 emb.weight\r\n"
188
+ ]
189
+ },
190
+ {
191
+ "name": "stdout",
192
+ "output_type": "stream",
193
+ "text": [
194
+ "2048 2048 1.0 blocks.0.att.gate.weight\r\n"
195
+ ]
196
+ },
197
+ {
198
+ "name": "stdout",
199
+ "output_type": "stream",
200
+ "text": [
201
+ "2048 2048 1.0 blocks.0.att.receptance.weight\r\n"
202
+ ]
203
+ },
204
+ {
205
+ "name": "stdout",
206
+ "output_type": "stream",
207
+ "text": [
208
+ "2048 2048 1.0 blocks.0.att.key.weight\r\n"
209
+ ]
210
+ },
211
+ {
212
+ "name": "stdout",
213
+ "output_type": "stream",
214
+ "text": [
215
+ "2048 2048 1.0 blocks.0.att.value.weight\r\n"
216
+ ]
217
+ },
218
+ {
219
+ "name": "stdout",
220
+ "output_type": "stream",
221
+ "text": [
222
+ "2048 2048 0 blocks.0.att.output.weight\r\n",
223
+ "7168 2048 1.0 blocks.0.ffn.key.weight\r\n"
224
+ ]
225
+ },
226
+ {
227
+ "name": "stdout",
228
+ "output_type": "stream",
229
+ "text": [
230
+ "2048 2048 0 blocks.0.ffn.receptance.weight\r\n",
231
+ "2048 7168 0 blocks.0.ffn.value.weight\r\n"
232
+ ]
233
+ },
234
+ {
235
+ "name": "stdout",
236
+ "output_type": "stream",
237
+ "text": [
238
+ "2048 2048 1.0 blocks.1.att.gate.weight\r\n"
239
+ ]
240
+ },
241
+ {
242
+ "name": "stdout",
243
+ "output_type": "stream",
244
+ "text": [
245
+ "2048 2048 1.0 blocks.1.att.receptance.weight\r\n"
246
+ ]
247
+ },
248
+ {
249
+ "name": "stdout",
250
+ "output_type": "stream",
251
+ "text": [
252
+ "2048 2048 1.0 blocks.1.att.key.weight\r\n"
253
+ ]
254
+ },
255
+ {
256
+ "name": "stdout",
257
+ "output_type": "stream",
258
+ "text": [
259
+ "2048 2048 1.0 blocks.1.att.value.weight\r\n"
260
+ ]
261
+ },
262
+ {
263
+ "name": "stdout",
264
+ "output_type": "stream",
265
+ "text": [
266
+ "2048 2048 0 blocks.1.att.output.weight\r\n",
267
+ "7168 2048 1.0 blocks.1.ffn.key.weight\r\n"
268
+ ]
269
+ },
270
+ {
271
+ "name": "stdout",
272
+ "output_type": "stream",
273
+ "text": [
274
+ "2048 2048 0 blocks.1.ffn.receptance.weight\r\n",
275
+ "2048 7168 0 blocks.1.ffn.value.weight\r\n",
276
+ "2048 2048 1.0 blocks.2.att.gate.weight\r\n"
277
+ ]
278
+ },
279
+ {
280
+ "name": "stdout",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "2048 2048 1.0 blocks.2.att.receptance.weight\r\n"
284
+ ]
285
+ },
286
+ {
287
+ "name": "stdout",
288
+ "output_type": "stream",
289
+ "text": [
290
+ "2048 2048 1.0 blocks.2.att.key.weight\r\n"
291
+ ]
292
+ },
293
+ {
294
+ "name": "stdout",
295
+ "output_type": "stream",
296
+ "text": [
297
+ "2048 2048 1.0 blocks.2.att.value.weight\r\n"
298
+ ]
299
+ },
300
+ {
301
+ "name": "stdout",
302
+ "output_type": "stream",
303
+ "text": [
304
+ "2048 2048 0 blocks.2.att.output.weight\r\n"
305
+ ]
306
+ },
307
+ {
308
+ "name": "stdout",
309
+ "output_type": "stream",
310
+ "text": [
311
+ "7168 2048 1.0 blocks.2.ffn.key.weight\r\n"
312
+ ]
313
+ },
314
+ {
315
+ "name": "stdout",
316
+ "output_type": "stream",
317
+ "text": [
318
+ "2048 2048 0 blocks.2.ffn.receptance.weight\r\n"
319
+ ]
320
+ },
321
+ {
322
+ "name": "stdout",
323
+ "output_type": "stream",
324
+ "text": [
325
+ "2048 7168 0 blocks.2.ffn.value.weight\r\n",
326
+ "2048 2048 1.0 blocks.3.att.gate.weight\r\n"
327
+ ]
328
+ },
329
+ {
330
+ "name": "stdout",
331
+ "output_type": "stream",
332
+ "text": [
333
+ "2048 2048 1.0 blocks.3.att.receptance.weight\r\n"
334
+ ]
335
+ },
336
+ {
337
+ "name": "stdout",
338
+ "output_type": "stream",
339
+ "text": [
340
+ "2048 2048 1.0 blocks.3.att.key.weight\r\n"
341
+ ]
342
+ },
343
+ {
344
+ "name": "stdout",
345
+ "output_type": "stream",
346
+ "text": [
347
+ "2048 2048 1.0 blocks.3.att.value.weight\r\n"
348
+ ]
349
+ },
350
+ {
351
+ "name": "stdout",
352
+ "output_type": "stream",
353
+ "text": [
354
+ "2048 2048 0 blocks.3.att.output.weight\r\n",
355
+ "7168 2048 1.0 blocks.3.ffn.key.weight\r\n"
356
+ ]
357
+ },
358
+ {
359
+ "name": "stdout",
360
+ "output_type": "stream",
361
+ "text": [
362
+ "2048 2048 0 blocks.3.ffn.receptance.weight\r\n",
363
+ "2048 7168 0 blocks.3.ffn.value.weight\r\n"
364
+ ]
365
+ },
366
+ {
367
+ "name": "stdout",
368
+ "output_type": "stream",
369
+ "text": [
370
+ "2048 2048 1.0 blocks.4.att.gate.weight\r\n"
371
+ ]
372
+ },
373
+ {
374
+ "name": "stdout",
375
+ "output_type": "stream",
376
+ "text": [
377
+ "2048 2048 1.0 blocks.4.att.receptance.weight\r\n"
378
+ ]
379
+ },
380
+ {
381
+ "name": "stdout",
382
+ "output_type": "stream",
383
+ "text": [
384
+ "2048 2048 1.0 blocks.4.att.key.weight\r\n"
385
+ ]
386
+ },
387
+ {
388
+ "name": "stdout",
389
+ "output_type": "stream",
390
+ "text": [
391
+ "2048 2048 1.0 blocks.4.att.value.weight\r\n"
392
+ ]
393
+ },
394
+ {
395
+ "name": "stdout",
396
+ "output_type": "stream",
397
+ "text": [
398
+ "2048 2048 0 blocks.4.att.output.weight\r\n",
399
+ "7168 2048 1.0 blocks.4.ffn.key.weight\r\n"
400
+ ]
401
+ },
402
+ {
403
+ "name": "stdout",
404
+ "output_type": "stream",
405
+ "text": [
406
+ "2048 2048 0 blocks.4.ffn.receptance.weight\r\n",
407
+ "2048 7168 0 blocks.4.ffn.value.weight\r\n"
408
+ ]
409
+ },
410
+ {
411
+ "name": "stdout",
412
+ "output_type": "stream",
413
+ "text": [
414
+ "2048 2048 1.0 blocks.5.att.gate.weight\r\n"
415
+ ]
416
+ },
417
+ {
418
+ "name": "stdout",
419
+ "output_type": "stream",
420
+ "text": [
421
+ "2048 2048 1.0 blocks.5.att.receptance.weight\r\n"
422
+ ]
423
+ },
424
+ {
425
+ "name": "stdout",
426
+ "output_type": "stream",
427
+ "text": [
428
+ "2048 2048 1.0 blocks.5.att.key.weight\r\n"
429
+ ]
430
+ },
431
+ {
432
+ "name": "stdout",
433
+ "output_type": "stream",
434
+ "text": [
435
+ "2048 2048 1.0 blocks.5.att.value.weight\r\n"
436
+ ]
437
+ },
438
+ {
439
+ "name": "stdout",
440
+ "output_type": "stream",
441
+ "text": [
442
+ "2048 2048 0 blocks.5.att.output.weight\r\n"
443
+ ]
444
+ },
445
+ {
446
+ "name": "stdout",
447
+ "output_type": "stream",
448
+ "text": [
449
+ "7168 2048 1.0 blocks.5.ffn.key.weight\r\n"
450
+ ]
451
+ },
452
+ {
453
+ "name": "stdout",
454
+ "output_type": "stream",
455
+ "text": [
456
+ "2048 2048 0 blocks.5.ffn.receptance.weight\r\n",
457
+ "2048 7168 0 blocks.5.ffn.value.weight\r\n",
458
+ "50277 2048 0.5 head.weight\r\n"
459
+ ]
460
+ }
461
+ ],
462
+ "source": [
463
+ "# Init the model\n",
464
+ "!cd \"{TRAINER_DIR}\" && \\\n",
465
+ " python3 ./init_model.py \\\n",
466
+ " --n_layer {LAYER_COUNT} --n_embd {EMBED_SIZE} \\\n",
467
+ " --emb-scale \"{EMBED_SCALE}\" \\\n",
468
+ " --vocab_size neox --skip-if-exists \\\n",
469
+ " \"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\""
470
+ ]
471
+ },
472
+ {
473
+ "cell_type": "markdown",
474
+ "id": "e3057f8b",
475
+ "metadata": {
476
+ "papermill": {
477
+ "duration": 0.006306,
478
+ "end_time": "2023-09-29T04:50:42.503924",
479
+ "exception": false,
480
+ "start_time": "2023-09-29T04:50:42.497618",
481
+ "status": "completed"
482
+ },
483
+ "tags": []
484
+ },
485
+ "source": [
486
+ "## Enwiki Stage 1 : Foundation 4k model training"
487
+ ]
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "execution_count": 4,
492
+ "id": "c06c6ad2",
493
+ "metadata": {
494
+ "execution": {
495
+ "iopub.execute_input": "2023-09-29T04:50:42.519776Z",
496
+ "iopub.status.busy": "2023-09-29T04:50:42.518734Z",
497
+ "iopub.status.idle": "2023-09-29T04:50:47.537394Z",
498
+ "shell.execute_reply": "2023-09-29T04:50:47.535673Z"
499
+ },
500
+ "papermill": {
501
+ "duration": 5.029265,
502
+ "end_time": "2023-09-29T04:50:47.539698",
503
+ "exception": false,
504
+ "start_time": "2023-09-29T04:50:42.510433",
505
+ "status": "completed"
506
+ },
507
+ "tags": []
508
+ },
509
+ "outputs": [
510
+ {
511
+ "name": "stdout",
512
+ "output_type": "stream",
513
+ "text": [
514
+ "Traceback (most recent call last):\r\n",
515
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/preload_datapath.py\", line 20, in <module>\r\n",
516
+ " assert os.path.exists(config_file), \"Config file does not exist\"\r\n",
517
+ "AssertionError: Config file does not exist\r\n"
518
+ ]
519
+ }
520
+ ],
521
+ "source": [
522
+ "# Let's preload the required dataset \n",
523
+ "!cd \"{TRAINER_DIR}\" && \\\n",
524
+ " python3 preload_datapath.py \"{NOTEBOOK_DIR}/v5base-enwiki-4k-part1.yaml\""
525
+ ]
526
+ },
527
+ {
528
+ "cell_type": "code",
529
+ "execution_count": 5,
530
+ "id": "4cc7e34f",
531
+ "metadata": {
532
+ "execution": {
533
+ "iopub.execute_input": "2023-09-29T04:50:47.552392Z",
534
+ "iopub.status.busy": "2023-09-29T04:50:47.551853Z",
535
+ "iopub.status.idle": "2023-09-29T04:50:47.806392Z",
536
+ "shell.execute_reply": "2023-09-29T04:50:47.805379Z"
537
+ },
538
+ "papermill": {
539
+ "duration": 0.264553,
540
+ "end_time": "2023-09-29T04:50:47.809133",
541
+ "exception": false,
542
+ "start_time": "2023-09-29T04:50:47.544580",
543
+ "status": "completed"
544
+ },
545
+ "tags": []
546
+ },
547
+ "outputs": [
548
+ {
549
+ "name": "stdout",
550
+ "output_type": "stream",
551
+ "text": [
552
+ "/usr/bin/sh: 1: cd: can't cd to {TRAINER_DIR}\r\n"
553
+ ]
554
+ }
555
+ ],
556
+ "source": [
557
+ "# Start the foundation model training\n",
558
+ "!cd \"{TRAINER_DIR}\" && \\\n",
559
+ " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
560
+ " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
561
+ " python lightning_trainer.py fit \\\n",
562
+ " -c \"{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml\" \\\n",
563
+ " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
564
+ " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
565
+ " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
566
+ " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/\" \\\n",
567
+ " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\" \\\n",
568
+ " --model.ctx_len=4096 \\\n",
569
+ " --model.bptt_learning_range=1"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": 6,
575
+ "id": "0b3b8134",
576
+ "metadata": {
577
+ "execution": {
578
+ "iopub.execute_input": "2023-09-29T04:50:47.825099Z",
579
+ "iopub.status.busy": "2023-09-29T04:50:47.824495Z",
580
+ "iopub.status.idle": "2023-09-29T04:50:48.327589Z",
581
+ "shell.execute_reply": "2023-09-29T04:50:48.326466Z"
582
+ },
583
+ "papermill": {
584
+ "duration": 0.514109,
585
+ "end_time": "2023-09-29T04:50:48.330177",
586
+ "exception": false,
587
+ "start_time": "2023-09-29T04:50:47.816068",
588
+ "status": "completed"
589
+ },
590
+ "tags": []
591
+ },
592
+ "outputs": [
593
+ {
594
+ "name": "stdout",
595
+ "output_type": "stream",
596
+ "text": [
597
+ "/usr/bin/sh: 1: python: not found\r\n"
598
+ ]
599
+ },
600
+ {
601
+ "name": "stdout",
602
+ "output_type": "stream",
603
+ "text": [
604
+ "ls: cannot access '../model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth': No such file or directory\r\n"
605
+ ]
606
+ }
607
+ ],
608
+ "source": [
609
+ "# Let's export the model from the checkpoint\n",
610
+ "!cd \"{TRAINER_DIR}\" && \\\n",
611
+ " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n",
612
+ "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\""
613
+ ]
614
+ },
615
+ {
616
+ "cell_type": "code",
617
+ "execution_count": 7,
618
+ "id": "92869fb9",
619
+ "metadata": {
620
+ "execution": {
621
+ "iopub.execute_input": "2023-09-29T04:50:48.346924Z",
622
+ "iopub.status.busy": "2023-09-29T04:50:48.346311Z",
623
+ "iopub.status.idle": "2023-09-29T04:50:48.600443Z",
624
+ "shell.execute_reply": "2023-09-29T04:50:48.599423Z"
625
+ },
626
+ "papermill": {
627
+ "duration": 0.26565,
628
+ "end_time": "2023-09-29T04:50:48.603118",
629
+ "exception": false,
630
+ "start_time": "2023-09-29T04:50:48.337468",
631
+ "status": "completed"
632
+ },
633
+ "tags": []
634
+ },
635
+ "outputs": [
636
+ {
637
+ "name": "stdout",
638
+ "output_type": "stream",
639
+ "text": [
640
+ "/usr/bin/sh: 1: cd: can't cd to {INFERENCE_DIR}\r\n"
641
+ ]
642
+ }
643
+ ],
644
+ "source": [
645
+ "# Let's do a quick dragon prompt validation\n",
646
+ "!cd \"{INFERENCE_DIR}\" && \\\n",
647
+ " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
648
+ " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\""
649
+ ]
650
+ }
651
+ ],
652
+ "metadata": {
653
+ "kernelspec": {
654
+ "display_name": "Python 3 (ipykernel)",
655
+ "language": "python",
656
+ "name": "python3"
657
+ },
658
+ "language_info": {
659
+ "codemirror_mode": {
660
+ "name": "ipython",
661
+ "version": 3
662
+ },
663
+ "file_extension": ".py",
664
+ "mimetype": "text/x-python",
665
+ "name": "python",
666
+ "nbconvert_exporter": "python",
667
+ "pygments_lexer": "ipython3",
668
+ "version": "3.10.12"
669
+ },
670
+ "papermill": {
671
+ "default_parameters": {},
672
+ "duration": 38.643439,
673
+ "end_time": "2023-09-29T04:50:49.032467",
674
+ "environment_variables": {},
675
+ "exception": null,
676
+ "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb",
677
+ "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb",
678
+ "parameters": {},
679
+ "start_time": "2023-09-29T04:50:10.389028",
680
+ "version": "2.4.0"
681
+ }
682
+ },
683
+ "nbformat": 4,
684
+ "nbformat_minor": 5
685
+ }