[GHA] trainer-v4-unit-test/model-init.ipynb result notebooks

#2
by picocreator - opened
actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/trainer-v4-unit-test/trainer-v4-unit-test/model-init.ipynb ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "id": "a6359476",
7
+ "metadata": {
8
+ "papermill": {
9
+ "duration": 0.002323,
10
+ "end_time": "2023-08-23T10:26:43.282956",
11
+ "exception": false,
12
+ "start_time": "2023-08-23T10:26:43.280633",
13
+ "status": "completed"
14
+ },
15
+ "tags": []
16
+ },
17
+ "source": [
18
+ "# Model Init\n",
19
+ "\n",
20
+ "Test that the model init code runs without issues\n",
21
+ "\n",
22
+ "**L6-D512 model with**\n",
23
+ "- Layer count: 6\n",
24
+ "- Embed size: 512"
25
+ ]
26
+ },
27
+ {
28
+ "attachments": {},
29
+ "cell_type": "markdown",
30
+ "id": "6c73e486",
31
+ "metadata": {
32
+ "notebookRunGroups": {
33
+ "groupValue": ""
34
+ },
35
+ "papermill": {
36
+ "duration": 0.001515,
37
+ "end_time": "2023-08-23T10:26:43.286466",
38
+ "exception": false,
39
+ "start_time": "2023-08-23T10:26:43.284951",
40
+ "status": "completed"
41
+ },
42
+ "tags": []
43
+ },
44
+ "source": [
45
+ "## Preparing the init model and test dataset"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 1,
51
+ "id": "fcface89",
52
+ "metadata": {
53
+ "execution": {
54
+ "iopub.execute_input": "2023-08-23T10:26:43.291673Z",
55
+ "iopub.status.busy": "2023-08-23T10:26:43.291157Z",
56
+ "iopub.status.idle": "2023-08-23T10:26:44.037056Z",
57
+ "shell.execute_reply": "2023-08-23T10:26:44.036039Z"
58
+ },
59
+ "papermill": {
60
+ "duration": 0.751285,
61
+ "end_time": "2023-08-23T10:26:44.039482",
62
+ "exception": false,
63
+ "start_time": "2023-08-23T10:26:43.288197",
64
+ "status": "completed"
65
+ },
66
+ "tags": []
67
+ },
68
+ "outputs": [],
69
+ "source": [
70
+ "# First, let's set up the various directories\n",
71
+ "!mkdir -p ../../model/\n",
72
+ "!mkdir -p ../../datapath/\n",
73
+ "!mkdir -p ../../checkpoint/"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 2,
79
+ "id": "b747f284",
80
+ "metadata": {
81
+ "execution": {
82
+ "iopub.execute_input": "2023-08-23T10:26:44.045133Z",
83
+ "iopub.status.busy": "2023-08-23T10:26:44.044626Z",
84
+ "iopub.status.idle": "2023-08-23T10:26:53.053696Z",
85
+ "shell.execute_reply": "2023-08-23T10:26:53.052569Z"
86
+ },
87
+ "papermill": {
88
+ "duration": 9.015161,
89
+ "end_time": "2023-08-23T10:26:53.056640",
90
+ "exception": false,
91
+ "start_time": "2023-08-23T10:26:44.041479",
92
+ "status": "completed"
93
+ },
94
+ "tags": []
95
+ },
96
+ "outputs": [
97
+ {
98
+ "name": "stdout",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "[2023-08-23 10:26:48,317] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
102
+ ]
103
+ },
104
+ {
105
+ "name": "stdout",
106
+ "output_type": "stream",
107
+ "text": [
108
+ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
109
+ ]
110
+ },
111
+ {
112
+ "name": "stdout",
113
+ "output_type": "stream",
114
+ "text": [
115
+ "---- Initializing model ----\r\n",
116
+ "No of layers: 6\r\n",
117
+ "Embedding size: 512\r\n",
118
+ "Output model path: ../model/L6-D512-neox-init.pth\r\n",
119
+ "Vocab size: 50277\r\n",
120
+ "Note: this process takes a significant time (and ram) for large models\r\n",
121
+ "---- ----- ----\r\n"
122
+ ]
123
+ },
124
+ {
125
+ "name": "stdout",
126
+ "output_type": "stream",
127
+ "text": [
128
+ "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\r\n"
129
+ ]
130
+ },
131
+ {
132
+ "name": "stdout",
133
+ "output_type": "stream",
134
+ "text": [
135
+ "Detected CUDA files, patching ldflags\r\n",
136
+ "Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/wkv_1_bf16/build.ninja...\r\n"
137
+ ]
138
+ },
139
+ {
140
+ "name": "stdout",
141
+ "output_type": "stream",
142
+ "text": [
143
+ "Building extension module wkv_1_bf16...\r\n",
144
+ "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n",
145
+ "ninja: no work to do.\r\n",
146
+ "Loading extension module wkv_1_bf16...\r\n"
147
+ ]
148
+ },
149
+ {
150
+ "name": "stdout",
151
+ "output_type": "stream",
152
+ "text": [
153
+ "[RWKV.model]: Finished initial model load\r\n",
154
+ "50277 512 -0.0001 emb.weight\r\n"
155
+ ]
156
+ },
157
+ {
158
+ "name": "stdout",
159
+ "output_type": "stream",
160
+ "text": [
161
+ "512 512 0 blocks.0.att.key.weight\r\n",
162
+ "512 512 1.0 blocks.0.att.value.weight\r\n",
163
+ "512 512 0 blocks.0.att.receptance.weight\r\n",
164
+ "512 512 0 blocks.0.att.output.weight\r\n",
165
+ "2048 512 1.0 blocks.0.ffn.key.weight\r\n"
166
+ ]
167
+ },
168
+ {
169
+ "name": "stdout",
170
+ "output_type": "stream",
171
+ "text": [
172
+ "512 512 0 blocks.0.ffn.receptance.weight\r\n",
173
+ "512 2048 0 blocks.0.ffn.value.weight\r\n",
174
+ "512 512 0 blocks.1.att.key.weight\r\n",
175
+ "512 512 1.0 blocks.1.att.value.weight\r\n"
176
+ ]
177
+ },
178
+ {
179
+ "name": "stdout",
180
+ "output_type": "stream",
181
+ "text": [
182
+ "512 512 0 blocks.1.att.receptance.weight\r\n",
183
+ "512 512 0 blocks.1.att.output.weight\r\n",
184
+ "2048 512 1.0 blocks.1.ffn.key.weight\r\n"
185
+ ]
186
+ },
187
+ {
188
+ "name": "stdout",
189
+ "output_type": "stream",
190
+ "text": [
191
+ "512 512 0 blocks.1.ffn.receptance.weight\r\n",
192
+ "512 2048 0 blocks.1.ffn.value.weight\r\n",
193
+ "512 512 0 blocks.2.att.key.weight\r\n",
194
+ "512 512 1.0 blocks.2.att.value.weight\r\n",
195
+ "512 512 0 blocks.2.att.receptance.weight\r\n",
196
+ "512 512 0 blocks.2.att.output.weight\r\n",
197
+ "2048 512 1.0 blocks.2.ffn.key.weight\r\n"
198
+ ]
199
+ },
200
+ {
201
+ "name": "stdout",
202
+ "output_type": "stream",
203
+ "text": [
204
+ "512 512 0 blocks.2.ffn.receptance.weight\r\n",
205
+ "512 2048 0 blocks.2.ffn.value.weight\r\n",
206
+ "512 512 0 blocks.3.att.key.weight\r\n",
207
+ "512 512 1.0 blocks.3.att.value.weight\r\n"
208
+ ]
209
+ },
210
+ {
211
+ "name": "stdout",
212
+ "output_type": "stream",
213
+ "text": [
214
+ "512 512 0 blocks.3.att.receptance.weight\r\n",
215
+ "512 512 0 blocks.3.att.output.weight\r\n",
216
+ "2048 512 1.0 blocks.3.ffn.key.weight\r\n"
217
+ ]
218
+ },
219
+ {
220
+ "name": "stdout",
221
+ "output_type": "stream",
222
+ "text": [
223
+ "512 512 0 blocks.3.ffn.receptance.weight\r\n",
224
+ "512 2048 0 blocks.3.ffn.value.weight\r\n",
225
+ "512 512 0 blocks.4.att.key.weight\r\n",
226
+ "512 512 1.0 blocks.4.att.value.weight\r\n",
227
+ "512 512 0 blocks.4.att.receptance.weight\r\n",
228
+ "512 512 0 blocks.4.att.output.weight\r\n",
229
+ "2048 512 1.0 blocks.4.ffn.key.weight\r\n"
230
+ ]
231
+ },
232
+ {
233
+ "name": "stdout",
234
+ "output_type": "stream",
235
+ "text": [
236
+ "512 512 0 blocks.4.ffn.receptance.weight\r\n",
237
+ "512 2048 0 blocks.4.ffn.value.weight\r\n",
238
+ "512 512 0 blocks.5.att.key.weight\r\n",
239
+ "512 512 1.0 blocks.5.att.value.weight\r\n"
240
+ ]
241
+ },
242
+ {
243
+ "name": "stdout",
244
+ "output_type": "stream",
245
+ "text": [
246
+ "512 512 0 blocks.5.att.receptance.weight\r\n",
247
+ "512 512 0 blocks.5.att.output.weight\r\n",
248
+ "2048 512 1.0 blocks.5.ffn.key.weight\r\n"
249
+ ]
250
+ },
251
+ {
252
+ "name": "stdout",
253
+ "output_type": "stream",
254
+ "text": [
255
+ "512 512 0 blocks.5.ffn.receptance.weight\r\n",
256
+ "512 2048 0 blocks.5.ffn.value.weight\r\n",
257
+ "50277 512 0.5 head.weight\r\n"
258
+ ]
259
+ }
260
+ ],
261
+ "source": [
262
+ "# Let's initialize the L6-D512 model with the init_model.py code\n",
263
+ "!cd ../../RWKV-v4neo/ && python3 init_model.py --n_layer 6 --n_embd 512 --vocab_size neox ../model/L6-D512-neox-init.pth"
264
+ ]
265
+ }
266
+ ],
267
+ "metadata": {
268
+ "kernelspec": {
269
+ "display_name": "rwkv-exp",
270
+ "language": "python",
271
+ "name": "python3"
272
+ },
273
+ "language_info": {
274
+ "codemirror_mode": {
275
+ "name": "ipython",
276
+ "version": 3
277
+ },
278
+ "file_extension": ".py",
279
+ "mimetype": "text/x-python",
280
+ "name": "python",
281
+ "nbconvert_exporter": "python",
282
+ "pygments_lexer": "ipython3",
283
+ "version": "3.10.12"
284
+ },
285
+ "papermill": {
286
+ "default_parameters": {},
287
+ "duration": 11.337437,
288
+ "end_time": "2023-08-23T10:26:53.383943",
289
+ "environment_variables": {},
290
+ "exception": null,
291
+ "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/trainer-v4-unit-test/model-init.ipynb",
292
+ "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/trainer-v4-unit-test/model-init.ipynb",
293
+ "parameters": {},
294
+ "start_time": "2023-08-23T10:26:42.046506",
295
+ "version": "2.4.0"
296
+ }
297
+ },
298
+ "nbformat": 4,
299
+ "nbformat_minor": 5
300
+ }