9wimu9 committed on
Commit
cf2ca1f
·
1 Parent(s): 1a04e96

Upload trainer.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer.ipynb +1415 -0
trainer.ipynb ADDED
@@ -0,0 +1,1415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "6b7d6a9b-db7e-46b2-8ab8-d6914e18f1e1",
7
+ "metadata": {
8
+ "execution": {
9
+ "iopub.execute_input": "2023-07-10T17:34:42.903289Z",
10
+ "iopub.status.busy": "2023-07-10T17:34:42.902734Z",
11
+ "iopub.status.idle": "2023-07-10T17:34:53.910158Z",
12
+ "shell.execute_reply": "2023-07-10T17:34:53.909356Z",
13
+ "shell.execute_reply.started": "2023-07-10T17:34:42.903264Z"
14
+ }
15
+ },
16
+ "outputs": [
17
+ {
18
+ "name": "stdout",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
22
+ "\u001b[0m"
23
+ ]
24
+ }
25
+ ],
26
+ "source": [
27
+ "!pip install datasets transformers accelerate wandb -U -q"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 2,
33
+ "id": "a2a349ee-2749-4490-9807-cdf18f428181",
34
+ "metadata": {
35
+ "execution": {
36
+ "iopub.execute_input": "2023-07-10T17:34:53.914661Z",
37
+ "iopub.status.busy": "2023-07-10T17:34:53.914431Z",
38
+ "iopub.status.idle": "2023-07-10T17:35:00.111736Z",
39
+ "shell.execute_reply": "2023-07-10T17:35:00.111000Z",
40
+ "shell.execute_reply.started": "2023-07-10T17:34:53.914639Z"
41
+ }
42
+ },
43
+ "outputs": [
44
+ {
45
+ "name": "stderr",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n",
49
+ "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n",
50
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:"
51
+ ]
52
+ },
53
+ {
54
+ "name": "stdin",
55
+ "output_type": "stream",
56
+ "text": [
57
+ " ········································\n"
58
+ ]
59
+ },
60
+ {
61
+ "name": "stderr",
62
+ "output_type": "stream",
63
+ "text": [
64
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n"
65
+ ]
66
+ },
67
+ {
68
+ "data": {
69
+ "text/plain": [
70
+ "True"
71
+ ]
72
+ },
73
+ "execution_count": 2,
74
+ "metadata": {},
75
+ "output_type": "execute_result"
76
+ }
77
+ ],
78
+ "source": [
79
+ "import wandb\n",
80
+ "wandb.login()"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 3,
86
+ "id": "bce905bd-4fc3-4a6a-bd76-48bba9ebc1d9",
87
+ "metadata": {
88
+ "execution": {
89
+ "iopub.execute_input": "2023-07-10T17:35:00.113303Z",
90
+ "iopub.status.busy": "2023-07-10T17:35:00.112906Z",
91
+ "iopub.status.idle": "2023-07-10T17:35:00.118570Z",
92
+ "shell.execute_reply": "2023-07-10T17:35:00.117737Z",
93
+ "shell.execute_reply.started": "2023-07-10T17:35:00.113271Z"
94
+ }
95
+ },
96
+ "outputs": [
97
+ {
98
+ "name": "stdout",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "env: WANDB_PROJECT=sinhala_bert_v1.2\n"
102
+ ]
103
+ }
104
+ ],
105
+ "source": [
106
+ "%env WANDB_PROJECT=sinhala_bert_v1.2"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 4,
112
+ "id": "2551a71d-2804-48ed-bf85-6e0aa94d47d8",
113
+ "metadata": {
114
+ "execution": {
115
+ "iopub.execute_input": "2023-07-10T17:35:00.120776Z",
116
+ "iopub.status.busy": "2023-07-10T17:35:00.120489Z",
117
+ "iopub.status.idle": "2023-07-10T17:35:00.124426Z",
118
+ "shell.execute_reply": "2023-07-10T17:35:00.123689Z",
119
+ "shell.execute_reply.started": "2023-07-10T17:35:00.120749Z"
120
+ }
121
+ },
122
+ "outputs": [],
123
+ "source": [
124
+ "model_checkpoint = \"9wimu9/sinhala-bert-1\"\n",
125
+ "tokenizer_checkpoint= \"9wimu9/sinhala-bert-1\""
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 5,
131
+ "id": "137a18d4-6fc5-4bfe-bf3c-54f4bdef5f4b",
132
+ "metadata": {
133
+ "execution": {
134
+ "iopub.execute_input": "2023-07-10T17:35:00.126040Z",
135
+ "iopub.status.busy": "2023-07-10T17:35:00.125473Z",
136
+ "iopub.status.idle": "2023-07-10T17:36:06.352581Z",
137
+ "shell.execute_reply": "2023-07-10T17:36:06.351983Z",
138
+ "shell.execute_reply.started": "2023-07-10T17:35:00.126013Z"
139
+ }
140
+ },
141
+ "outputs": [
142
+ {
143
+ "data": {
144
+ "application/vnd.jupyter.widget-view+json": {
145
+ "model_id": "d7d3d422d87f467f899071b2c4ed86b4",
146
+ "version_major": 2,
147
+ "version_minor": 0
148
+ },
149
+ "text/plain": [
150
+ "Downloading readme: 0%| | 0.00/608 [00:00<?, ?B/s]"
151
+ ]
152
+ },
153
+ "metadata": {},
154
+ "output_type": "display_data"
155
+ },
156
+ {
157
+ "name": "stdout",
158
+ "output_type": "stream",
159
+ "text": [
160
+ "Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/9wimu9___parquet/9wimu9--sinhala_30m_tokenized-4ef7deb3027f7158/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n"
161
+ ]
162
+ },
163
+ {
164
+ "data": {
165
+ "application/vnd.jupyter.widget-view+json": {
166
+ "model_id": "d0f8e126c22c4fd88dc49b3ad4cf2d22",
167
+ "version_major": 2,
168
+ "version_minor": 0
169
+ },
170
+ "text/plain": [
171
+ "Downloading data files: 0%| | 0/3 [00:00<?, ?it/s]"
172
+ ]
173
+ },
174
+ "metadata": {},
175
+ "output_type": "display_data"
176
+ },
177
+ {
178
+ "data": {
179
+ "application/vnd.jupyter.widget-view+json": {
180
+ "model_id": "fd3367b878d54022a264d978e91f8b0e",
181
+ "version_major": 2,
182
+ "version_minor": 0
183
+ },
184
+ "text/plain": [
185
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
186
+ ]
187
+ },
188
+ "metadata": {},
189
+ "output_type": "display_data"
190
+ },
191
+ {
192
+ "data": {
193
+ "application/vnd.jupyter.widget-view+json": {
194
+ "model_id": "3940cbda3f324a4dbd6ae25c81c89a0b",
195
+ "version_major": 2,
196
+ "version_minor": 0
197
+ },
198
+ "text/plain": [
199
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
200
+ ]
201
+ },
202
+ "metadata": {},
203
+ "output_type": "display_data"
204
+ },
205
+ {
206
+ "data": {
207
+ "application/vnd.jupyter.widget-view+json": {
208
+ "model_id": "234a689e98054963b50ccb99a09bbc76",
209
+ "version_major": 2,
210
+ "version_minor": 0
211
+ },
212
+ "text/plain": [
213
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
214
+ ]
215
+ },
216
+ "metadata": {},
217
+ "output_type": "display_data"
218
+ },
219
+ {
220
+ "data": {
221
+ "application/vnd.jupyter.widget-view+json": {
222
+ "model_id": "544d17642acf492c92958c5348c5e3a4",
223
+ "version_major": 2,
224
+ "version_minor": 0
225
+ },
226
+ "text/plain": [
227
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
228
+ ]
229
+ },
230
+ "metadata": {},
231
+ "output_type": "display_data"
232
+ },
233
+ {
234
+ "data": {
235
+ "application/vnd.jupyter.widget-view+json": {
236
+ "model_id": "06a99252f2e2433e951566252b3d5f22",
237
+ "version_major": 2,
238
+ "version_minor": 0
239
+ },
240
+ "text/plain": [
241
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
242
+ ]
243
+ },
244
+ "metadata": {},
245
+ "output_type": "display_data"
246
+ },
247
+ {
248
+ "data": {
249
+ "application/vnd.jupyter.widget-view+json": {
250
+ "model_id": "79276ae55ce14e22bd45a0e5606fbd48",
251
+ "version_major": 2,
252
+ "version_minor": 0
253
+ },
254
+ "text/plain": [
255
+ "Downloading data: 0%| | 0.00/114M [00:00<?, ?B/s]"
256
+ ]
257
+ },
258
+ "metadata": {},
259
+ "output_type": "display_data"
260
+ },
261
+ {
262
+ "data": {
263
+ "application/vnd.jupyter.widget-view+json": {
264
+ "model_id": "41c589dddbf14ac489f62f9bcf5a28ce",
265
+ "version_major": 2,
266
+ "version_minor": 0
267
+ },
268
+ "text/plain": [
269
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
270
+ ]
271
+ },
272
+ "metadata": {},
273
+ "output_type": "display_data"
274
+ },
275
+ {
276
+ "data": {
277
+ "application/vnd.jupyter.widget-view+json": {
278
+ "model_id": "cf5c2a8975bf4353992067eaeeea1743",
279
+ "version_major": 2,
280
+ "version_minor": 0
281
+ },
282
+ "text/plain": [
283
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
284
+ ]
285
+ },
286
+ "metadata": {},
287
+ "output_type": "display_data"
288
+ },
289
+ {
290
+ "data": {
291
+ "application/vnd.jupyter.widget-view+json": {
292
+ "model_id": "139953de43e046fbb85bca9be7155d28",
293
+ "version_major": 2,
294
+ "version_minor": 0
295
+ },
296
+ "text/plain": [
297
+ "Downloading data: 0%| | 0.00/114M [00:00<?, ?B/s]"
298
+ ]
299
+ },
300
+ "metadata": {},
301
+ "output_type": "display_data"
302
+ },
303
+ {
304
+ "data": {
305
+ "application/vnd.jupyter.widget-view+json": {
306
+ "model_id": "c455b35acc6a4ee2a6c76c65e4a29d45",
307
+ "version_major": 2,
308
+ "version_minor": 0
309
+ },
310
+ "text/plain": [
311
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
312
+ ]
313
+ },
314
+ "metadata": {},
315
+ "output_type": "display_data"
316
+ },
317
+ {
318
+ "data": {
319
+ "application/vnd.jupyter.widget-view+json": {
320
+ "model_id": "dac6e8d5597544e883a13498159cddb5",
321
+ "version_major": 2,
322
+ "version_minor": 0
323
+ },
324
+ "text/plain": [
325
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
326
+ ]
327
+ },
328
+ "metadata": {},
329
+ "output_type": "display_data"
330
+ },
331
+ {
332
+ "data": {
333
+ "application/vnd.jupyter.widget-view+json": {
334
+ "model_id": "92fbb207ac494845af3d94bb542576cf",
335
+ "version_major": 2,
336
+ "version_minor": 0
337
+ },
338
+ "text/plain": [
339
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
340
+ ]
341
+ },
342
+ "metadata": {},
343
+ "output_type": "display_data"
344
+ },
345
+ {
346
+ "data": {
347
+ "application/vnd.jupyter.widget-view+json": {
348
+ "model_id": "28b3fa3e947b418a8c963429b9a1dd41",
349
+ "version_major": 2,
350
+ "version_minor": 0
351
+ },
352
+ "text/plain": [
353
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
354
+ ]
355
+ },
356
+ "metadata": {},
357
+ "output_type": "display_data"
358
+ },
359
+ {
360
+ "data": {
361
+ "application/vnd.jupyter.widget-view+json": {
362
+ "model_id": "86caa15d02994bda8775768afbd85b94",
363
+ "version_major": 2,
364
+ "version_minor": 0
365
+ },
366
+ "text/plain": [
367
+ "Downloading data: 0%| | 0.00/114M [00:00<?, ?B/s]"
368
+ ]
369
+ },
370
+ "metadata": {},
371
+ "output_type": "display_data"
372
+ },
373
+ {
374
+ "data": {
375
+ "application/vnd.jupyter.widget-view+json": {
376
+ "model_id": "2076c346b559410bb151663a2707813c",
377
+ "version_major": 2,
378
+ "version_minor": 0
379
+ },
380
+ "text/plain": [
381
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
382
+ ]
383
+ },
384
+ "metadata": {},
385
+ "output_type": "display_data"
386
+ },
387
+ {
388
+ "data": {
389
+ "application/vnd.jupyter.widget-view+json": {
390
+ "model_id": "d9ae84724e4e4073aded53ba05f53743",
391
+ "version_major": 2,
392
+ "version_minor": 0
393
+ },
394
+ "text/plain": [
395
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
396
+ ]
397
+ },
398
+ "metadata": {},
399
+ "output_type": "display_data"
400
+ },
401
+ {
402
+ "data": {
403
+ "application/vnd.jupyter.widget-view+json": {
404
+ "model_id": "55f54d71a81c43f5affe26ee2a5cdf6a",
405
+ "version_major": 2,
406
+ "version_minor": 0
407
+ },
408
+ "text/plain": [
409
+ "Downloading data: 0%| | 0.00/114M [00:00<?, ?B/s]"
410
+ ]
411
+ },
412
+ "metadata": {},
413
+ "output_type": "display_data"
414
+ },
415
+ {
416
+ "data": {
417
+ "application/vnd.jupyter.widget-view+json": {
418
+ "model_id": "da771932e1ae443eb83a6882441593fb",
419
+ "version_major": 2,
420
+ "version_minor": 0
421
+ },
422
+ "text/plain": [
423
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
424
+ ]
425
+ },
426
+ "metadata": {},
427
+ "output_type": "display_data"
428
+ },
429
+ {
430
+ "data": {
431
+ "application/vnd.jupyter.widget-view+json": {
432
+ "model_id": "2bb6366bdd4545f895d73c8e556f8e85",
433
+ "version_major": 2,
434
+ "version_minor": 0
435
+ },
436
+ "text/plain": [
437
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
438
+ ]
439
+ },
440
+ "metadata": {},
441
+ "output_type": "display_data"
442
+ },
443
+ {
444
+ "data": {
445
+ "application/vnd.jupyter.widget-view+json": {
446
+ "model_id": "5553d13a989c48d39df14955eb4701e2",
447
+ "version_major": 2,
448
+ "version_minor": 0
449
+ },
450
+ "text/plain": [
451
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
452
+ ]
453
+ },
454
+ "metadata": {},
455
+ "output_type": "display_data"
456
+ },
457
+ {
458
+ "data": {
459
+ "application/vnd.jupyter.widget-view+json": {
460
+ "model_id": "adedd757f27240c989029c243d5ee76d",
461
+ "version_major": 2,
462
+ "version_minor": 0
463
+ },
464
+ "text/plain": [
465
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
466
+ ]
467
+ },
468
+ "metadata": {},
469
+ "output_type": "display_data"
470
+ },
471
+ {
472
+ "data": {
473
+ "application/vnd.jupyter.widget-view+json": {
474
+ "model_id": "c2e6754349e14f7da1cfd44e8cc23e11",
475
+ "version_major": 2,
476
+ "version_minor": 0
477
+ },
478
+ "text/plain": [
479
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
480
+ ]
481
+ },
482
+ "metadata": {},
483
+ "output_type": "display_data"
484
+ },
485
+ {
486
+ "data": {
487
+ "application/vnd.jupyter.widget-view+json": {
488
+ "model_id": "a242424a016a4f3f86e2e74683358b28",
489
+ "version_major": 2,
490
+ "version_minor": 0
491
+ },
492
+ "text/plain": [
493
+ "Downloading data: 0%| | 0.00/115M [00:00<?, ?B/s]"
494
+ ]
495
+ },
496
+ "metadata": {},
497
+ "output_type": "display_data"
498
+ },
499
+ {
500
+ "data": {
501
+ "application/vnd.jupyter.widget-view+json": {
502
+ "model_id": "13deb04a7b294cbe8c5365aa5fb7f037",
503
+ "version_major": 2,
504
+ "version_minor": 0
505
+ },
506
+ "text/plain": [
507
+ "Downloading data: 0%| | 0.00/73.2M [00:00<?, ?B/s]"
508
+ ]
509
+ },
510
+ "metadata": {},
511
+ "output_type": "display_data"
512
+ },
513
+ {
514
+ "data": {
515
+ "application/vnd.jupyter.widget-view+json": {
516
+ "model_id": "a7045bb0d6034a80b14fe5faca6dc8cf",
517
+ "version_major": 2,
518
+ "version_minor": 0
519
+ },
520
+ "text/plain": [
521
+ "Downloading data: 0%| | 0.00/73.2M [00:00<?, ?B/s]"
522
+ ]
523
+ },
524
+ "metadata": {},
525
+ "output_type": "display_data"
526
+ },
527
+ {
528
+ "data": {
529
+ "application/vnd.jupyter.widget-view+json": {
530
+ "model_id": "179036346c214ebd8f8286ad1c097455",
531
+ "version_major": 2,
532
+ "version_minor": 0
533
+ },
534
+ "text/plain": [
535
+ "Downloading data: 0%| | 0.00/73.1M [00:00<?, ?B/s]"
536
+ ]
537
+ },
538
+ "metadata": {},
539
+ "output_type": "display_data"
540
+ },
541
+ {
542
+ "data": {
543
+ "application/vnd.jupyter.widget-view+json": {
544
+ "model_id": "0ed65fd339cb4293b9d2a312e6012b08",
545
+ "version_major": 2,
546
+ "version_minor": 0
547
+ },
548
+ "text/plain": [
549
+ "Downloading data: 0%| | 0.00/73.1M [00:00<?, ?B/s]"
550
+ ]
551
+ },
552
+ "metadata": {},
553
+ "output_type": "display_data"
554
+ },
555
+ {
556
+ "data": {
557
+ "application/vnd.jupyter.widget-view+json": {
558
+ "model_id": "27d5e5b89c114454b831fc3c4e3fb80b",
559
+ "version_major": 2,
560
+ "version_minor": 0
561
+ },
562
+ "text/plain": [
563
+ "Extracting data files: 0%| | 0/3 [00:00<?, ?it/s]"
564
+ ]
565
+ },
566
+ "metadata": {},
567
+ "output_type": "display_data"
568
+ },
569
+ {
570
+ "data": {
571
+ "application/vnd.jupyter.widget-view+json": {
572
+ "model_id": "13e100995a514014b5f2033c045ecafa",
573
+ "version_major": 2,
574
+ "version_minor": 0
575
+ },
576
+ "text/plain": [
577
+ "Generating train split: 0%| | 0/7310725 [00:00<?, ? examples/s]"
578
+ ]
579
+ },
580
+ "metadata": {},
581
+ "output_type": "display_data"
582
+ },
583
+ {
584
+ "data": {
585
+ "application/vnd.jupyter.widget-view+json": {
586
+ "model_id": "21391aec64e5442e93395b4e4a0db3ea",
587
+ "version_major": 2,
588
+ "version_minor": 0
589
+ },
590
+ "text/plain": [
591
+ "Generating test split: 0%| | 0/406414 [00:00<?, ? examples/s]"
592
+ ]
593
+ },
594
+ "metadata": {},
595
+ "output_type": "display_data"
596
+ },
597
+ {
598
+ "data": {
599
+ "application/vnd.jupyter.widget-view+json": {
600
+ "model_id": "aa8eba7dc1e749b4a9ba497d00715161",
601
+ "version_major": 2,
602
+ "version_minor": 0
603
+ },
604
+ "text/plain": [
605
+ "Generating valid split: 0%| | 0/405841 [00:00<?, ? examples/s]"
606
+ ]
607
+ },
608
+ "metadata": {},
609
+ "output_type": "display_data"
610
+ },
611
+ {
612
+ "name": "stdout",
613
+ "output_type": "stream",
614
+ "text": [
615
+ "Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/9wimu9___parquet/9wimu9--sinhala_30m_tokenized-4ef7deb3027f7158/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n"
616
+ ]
617
+ },
618
+ {
619
+ "data": {
620
+ "application/vnd.jupyter.widget-view+json": {
621
+ "model_id": "2bc96a6692d543f1b4bda73c6c03d592",
622
+ "version_major": 2,
623
+ "version_minor": 0
624
+ },
625
+ "text/plain": [
626
+ " 0%| | 0/3 [00:00<?, ?it/s]"
627
+ ]
628
+ },
629
+ "metadata": {},
630
+ "output_type": "display_data"
631
+ },
632
+ {
633
+ "data": {
634
+ "text/plain": [
635
+ "DatasetDict({\n",
636
+ " train: Dataset({\n",
637
+ " features: ['input_ids', 'attention_mask', 'special_tokens_mask'],\n",
638
+ " num_rows: 7310725\n",
639
+ " })\n",
640
+ " test: Dataset({\n",
641
+ " features: ['input_ids', 'attention_mask', 'special_tokens_mask'],\n",
642
+ " num_rows: 406414\n",
643
+ " })\n",
644
+ " valid: Dataset({\n",
645
+ " features: ['input_ids', 'attention_mask', 'special_tokens_mask'],\n",
646
+ " num_rows: 405841\n",
647
+ " })\n",
648
+ "})"
649
+ ]
650
+ },
651
+ "execution_count": 5,
652
+ "metadata": {},
653
+ "output_type": "execute_result"
654
+ }
655
+ ],
656
+ "source": [
657
+ "from datasets import load_dataset\n",
658
+ "lm_datasets = load_dataset('9wimu9/sinhala_30m_tokenized')\n",
659
+ "lm_datasets"
660
+ ]
661
+ },
662
+ {
663
+ "cell_type": "code",
664
+ "execution_count": 6,
665
+ "id": "e81c4c2a-d6e2-4a41-bcef-218c205d9544",
666
+ "metadata": {
667
+ "execution": {
668
+ "iopub.execute_input": "2023-07-10T17:37:42.118612Z",
669
+ "iopub.status.busy": "2023-07-10T17:37:42.117810Z",
670
+ "iopub.status.idle": "2023-07-10T17:37:48.570390Z",
671
+ "shell.execute_reply": "2023-07-10T17:37:48.569592Z",
672
+ "shell.execute_reply.started": "2023-07-10T17:37:42.118586Z"
673
+ }
674
+ },
675
+ "outputs": [
676
+ {
677
+ "data": {
678
+ "text/plain": [
679
+ "RobertaConfig {\n",
680
+ " \"_name_or_path\": \"/notebooks/roberta-large-pretrained-si\",\n",
681
+ " \"architectures\": [\n",
682
+ " \"RobertaForMaskedLM\"\n",
683
+ " ],\n",
684
+ " \"attention_probs_dropout_prob\": 0.1,\n",
685
+ " \"bos_token_id\": 0,\n",
686
+ " \"classifier_dropout\": null,\n",
687
+ " \"eos_token_id\": 2,\n",
688
+ " \"hidden_act\": \"gelu\",\n",
689
+ " \"hidden_dropout_prob\": 0.1,\n",
690
+ " \"hidden_size\": 1024,\n",
691
+ " \"initializer_range\": 0.02,\n",
692
+ " \"intermediate_size\": 4096,\n",
693
+ " \"layer_norm_eps\": 1e-05,\n",
694
+ " \"max_position_embeddings\": 514,\n",
695
+ " \"model_type\": \"roberta\",\n",
696
+ " \"num_attention_heads\": 16,\n",
697
+ " \"num_hidden_layers\": 24,\n",
698
+ " \"pad_token_id\": 1,\n",
699
+ " \"position_embedding_type\": \"absolute\",\n",
700
+ " \"transformers_version\": \"4.30.2\",\n",
701
+ " \"type_vocab_size\": 1,\n",
702
+ " \"use_cache\": true,\n",
703
+ " \"vocab_size\": 12500\n",
704
+ "}"
705
+ ]
706
+ },
707
+ "execution_count": 6,
708
+ "metadata": {},
709
+ "output_type": "execute_result"
710
+ }
711
+ ],
712
+ "source": [
713
+ "from transformers import AutoConfig, AutoModelForMaskedLM\n",
714
+ "# config = AutoConfig.from_pretrained(model_checkpoint)\n",
715
+ "config = AutoConfig.from_pretrained('/notebooks/roberta-large-pretrained-si')\n",
716
+ "\n",
717
+ "model = AutoModelForMaskedLM.from_config(config)\n",
718
+ "config"
719
+ ]
720
+ },
721
+ {
722
+ "cell_type": "code",
723
+ "execution_count": 7,
724
+ "id": "065f3958-2b05-4a5a-8f05-b049c14fb5f0",
725
+ "metadata": {
726
+ "execution": {
727
+ "iopub.execute_input": "2023-07-10T09:32:55.634897Z",
728
+ "iopub.status.busy": "2023-07-10T09:32:55.634368Z",
729
+ "iopub.status.idle": "2023-07-10T09:32:55.640686Z",
730
+ "shell.execute_reply": "2023-07-10T09:32:55.640297Z",
731
+ "shell.execute_reply.started": "2023-07-10T09:32:55.634879Z"
732
+ }
733
+ },
734
+ "outputs": [],
735
+ "source": [
736
+ "# max_token_size=128\n",
737
+ "# use model architecture -> BERT large\n",
738
+ "# 24 layers, 1,024 dimensions, 16 heads, 4,096 hidden dimensions in the feed-forward layer, with pre-layer normalization\n",
739
+ "\n",
740
+ "\n",
741
+ "# We follow the optimization of RoBERTa (Liu et al., 2019) and use \n",
742
+ "# AdamW (Loshchilov and Hutter, 2019) with \n",
743
+ "# β1 = 0.9, β2 = 0.98, ε = 1e-6, \n",
744
+ "# weight decay of 0.01, dropout 0.1, and \n",
745
+ "# attention dropout 0.1.\n",
746
+ "\n",
747
+ "\n",
748
+ "# Hyperparameters\n",
749
+ "\n",
750
+ "# batch size -> 4k, 8k, and 16k (via gradient accumilation)\n",
751
+ "\n",
752
+ "# Warmup Proportion (wu) We determine the number of warmup steps as a proportion of the total number of steps. \n",
753
+ "# Specifically, we try 0%, 2%, 4%, and 6%, which all reflect significantly fewer warmup steps than in BERT.\n",
754
+ "\n",
755
+ "# Peak Learning Rate (lr) Our linear learning rate scheduler, \n",
756
+ "# which starts at 0, warms up to the peak learning rate, and then decays back to 0. We try 5e-4, 1e-3, and 2e-3\n",
757
+ "\n"
758
+ ]
759
+ },
760
+ {
761
+ "cell_type": "code",
762
+ "execution_count": 7,
763
+ "id": "858cd60b-32c4-4c0f-859e-10a1ee3bf68e",
764
+ "metadata": {
765
+ "execution": {
766
+ "iopub.execute_input": "2023-07-10T17:37:48.572108Z",
767
+ "iopub.status.busy": "2023-07-10T17:37:48.571665Z",
768
+ "iopub.status.idle": "2023-07-10T17:37:48.610050Z",
769
+ "shell.execute_reply": "2023-07-10T17:37:48.609409Z",
770
+ "shell.execute_reply.started": "2023-07-10T17:37:48.572101Z"
771
+ }
772
+ },
773
+ "outputs": [],
774
+ "source": [
775
+ "from transformers import AutoTokenizer\n",
776
+ "# tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint,model_max_length=256)\n",
777
+ "tokenizer = AutoTokenizer.from_pretrained('/notebooks/roberta-large-pretrained-si',model_max_length=256)"
778
+ ]
779
+ },
780
+ {
781
+ "cell_type": "code",
782
+ "execution_count": 13,
783
+ "id": "5812f8da-3434-4ec8-a2e6-a6bdc30ecf72",
784
+ "metadata": {
785
+ "execution": {
786
+ "iopub.execute_input": "2023-07-10T17:38:51.772892Z",
787
+ "iopub.status.busy": "2023-07-10T17:38:51.772628Z",
788
+ "iopub.status.idle": "2023-07-10T17:38:51.777952Z",
789
+ "shell.execute_reply": "2023-07-10T17:38:51.777265Z",
790
+ "shell.execute_reply.started": "2023-07-10T17:38:51.772871Z"
791
+ }
792
+ },
793
+ "outputs": [
794
+ {
795
+ "data": {
796
+ "text/plain": [
797
+ "RobertaTokenizerFast(name_or_path='/notebooks/roberta-large-pretrained-si', vocab_size=1868, model_max_length=256, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken(\"<mask>\", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)"
798
+ ]
799
+ },
800
+ "execution_count": 13,
801
+ "metadata": {},
802
+ "output_type": "execute_result"
803
+ }
804
+ ],
805
+ "source": [
806
+ "tokenizer"
807
+ ]
808
+ },
809
+ {
810
+ "cell_type": "code",
811
+ "execution_count": 9,
812
+ "id": "0905ef8c-9faa-49d6-ad0a-06753ce856fa",
813
+ "metadata": {
814
+ "execution": {
815
+ "iopub.execute_input": "2023-07-10T17:37:49.993189Z",
816
+ "iopub.status.busy": "2023-07-10T17:37:49.992541Z",
817
+ "iopub.status.idle": "2023-07-10T17:37:49.996729Z",
818
+ "shell.execute_reply": "2023-07-10T17:37:49.996008Z",
819
+ "shell.execute_reply.started": "2023-07-10T17:37:49.993157Z"
820
+ }
821
+ },
822
+ "outputs": [],
823
+ "source": [
824
+ "per_device_train_batch_size=400\n",
825
+ "gradient_accumulation_steps=10\n",
826
+ "num_train_epochs=1\n",
827
+ "warmup_rate=0.01"
828
+ ]
829
+ },
830
+ {
831
+ "cell_type": "code",
832
+ "execution_count": 10,
833
+ "id": "6056f333-46f9-4bea-a93d-423f3a1a959e",
834
+ "metadata": {
835
+ "execution": {
836
+ "iopub.execute_input": "2023-07-10T17:37:55.793688Z",
837
+ "iopub.status.busy": "2023-07-10T17:37:55.792933Z",
838
+ "iopub.status.idle": "2023-07-10T17:37:58.921474Z",
839
+ "shell.execute_reply": "2023-07-10T17:37:58.920666Z",
840
+ "shell.execute_reply.started": "2023-07-10T17:37:55.793660Z"
841
+ }
842
+ },
843
+ "outputs": [],
844
+ "source": [
845
+ "from transformers import TrainingArguments\n",
846
+ "training_args = TrainingArguments(\n",
847
+ " model_checkpoint,\n",
848
+ " evaluation_strategy = \"epoch\",\n",
849
+ " # push_to_hub=True,\n",
850
+ " # hub_model_id=\"sinhala-bert-v.1\",\n",
851
+ " per_device_train_batch_size=per_device_train_batch_size, # 4000,8000,16000\n",
852
+ " gradient_accumulation_steps=gradient_accumulation_steps,\n",
853
+ " gradient_checkpointing=True,\n",
854
+ " fp16=True,\n",
855
+ " report_to=\"wandb\", \n",
856
+ " num_train_epochs=num_train_epochs,\n",
857
+ " no_cuda=False,\n",
858
+ " logging_steps=1,\n",
859
+ " save_steps=25,\n",
860
+ " save_total_limit=3,\n",
861
+ " # load_best_model_at_end=True, # whether to load the best model (in terms of loss) at the end of training\n",
862
+ ")\n"
863
+ ]
864
+ },
865
+ {
866
+ "cell_type": "code",
867
+ "execution_count": 11,
868
+ "id": "7f6078f0-ba64-4509-ac8f-39dd0cd7fe04",
869
+ "metadata": {
870
+ "execution": {
871
+ "iopub.execute_input": "2023-07-10T17:38:00.867885Z",
872
+ "iopub.status.busy": "2023-07-10T17:38:00.867375Z",
873
+ "iopub.status.idle": "2023-07-10T17:38:00.876595Z",
874
+ "shell.execute_reply": "2023-07-10T17:38:00.875989Z",
875
+ "shell.execute_reply.started": "2023-07-10T17:38:00.867857Z"
876
+ }
877
+ },
878
+ "outputs": [
879
+ {
880
+ "data": {
881
+ "text/plain": [
882
+ "(7310725, 1828, 18, 4000)"
883
+ ]
884
+ },
885
+ "execution_count": 11,
886
+ "metadata": {},
887
+ "output_type": "execute_result"
888
+ }
889
+ ],
890
+ "source": [
891
+ "from transformers import get_polynomial_decay_schedule_with_warmup,AdamW,get_linear_schedule_with_warmup\n",
892
+ "import math,torch\n",
893
+ "\n",
894
+ "params = filter(lambda x: x.requires_grad, model.parameters())\n",
895
+ "\n",
896
+ "optimizer = torch.optim.AdamW(params,lr=2e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.01)\n",
897
+ "\n",
898
+ "batch_size = per_device_train_batch_size*gradient_accumulation_steps\n",
899
+ "\n",
900
+ "num_warmup_steps = math.ceil(lm_datasets[\"train\"].num_rows / batch_size) * warmup_rate*num_train_epochs\n",
901
+ "num_warmup_steps = int(num_warmup_steps)\n",
902
+ "num_training_steps = math.ceil(lm_datasets[\"train\"].num_rows / batch_size) * num_train_epochs\n",
903
+ "\n",
904
+ "\n",
905
+ "scheduler = get_linear_schedule_with_warmup(optimizer,\n",
906
+ " num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)\n",
907
+ "\n",
908
+ "lm_datasets[\"train\"].num_rows,num_training_steps,num_warmup_steps,batch_size"
909
+ ]
910
+ },
911
+ {
912
+ "cell_type": "code",
913
+ "execution_count": 12,
914
+ "id": "ebf14d20-e630-4961-a6d4-d9c8fa90e941",
915
+ "metadata": {
916
+ "execution": {
917
+ "iopub.execute_input": "2023-07-10T17:38:05.602802Z",
918
+ "iopub.status.busy": "2023-07-10T17:38:05.602191Z",
919
+ "iopub.status.idle": "2023-07-10T17:38:11.030425Z",
920
+ "shell.execute_reply": "2023-07-10T17:38:11.029681Z",
921
+ "shell.execute_reply.started": "2023-07-10T17:38:05.602778Z"
922
+ }
923
+ },
924
+ "outputs": [
925
+ {
926
+ "name": "stdout",
927
+ "output_type": "stream",
928
+ "text": [
929
+ "Reading package lists... Done\n",
930
+ "Building dependency tree \n",
931
+ "Reading state information... Done\n",
932
+ "The following NEW packages will be installed:\n",
933
+ " git-lfs\n",
934
+ "0 upgraded, 1 newly installed, 0 to remove and 3 not upgraded.\n",
935
+ "Need to get 3316 kB of archives.\n",
936
+ "After this operation, 11.1 MB of additional disk space will be used.\n",
937
+ "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 git-lfs amd64 2.9.2-1 [3316 kB]\n",
938
+ "Fetched 3316 kB in 1s (3375 kB/s) \u001b[0m33m\u001b[33m\n",
939
+ "\n",
940
+ "\u001b7\u001b[0;23r\u001b8\u001b[1ASelecting previously unselected package git-lfs.\n",
941
+ "(Reading database ... 69943 files and directories currently installed.)\n",
942
+ "Preparing to unpack .../git-lfs_2.9.2-1_amd64.deb ...\n",
943
+ "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 0%]\u001b[49m\u001b[39m [..........................................................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 20%]\u001b[49m\u001b[39m [###########...............................................] \u001b8Unpacking git-lfs (2.9.2-1) ...\n",
944
+ "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 40%]\u001b[49m\u001b[39m [#######################...................................] \u001b8Setting up git-lfs (2.9.2-1) ...\n",
945
+ "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 60%]\u001b[49m\u001b[39m [##################################........................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 80%]\u001b[49m\u001b[39m [##############################################............] \u001b8Processing triggers for man-db (2.9.1-1) ...\n",
946
+ "\n",
947
+ "\u001b7\u001b[0;24r\u001b8\u001b[1A\u001b[J"
948
+ ]
949
+ }
950
+ ],
951
+ "source": [
952
+ "!sudo apt install git-lfs"
953
+ ]
954
+ },
955
+ {
956
+ "cell_type": "code",
957
+ "execution_count": 19,
958
+ "id": "632113ee-cdcf-45a9-a325-60eaaa1b5f1c",
959
+ "metadata": {
960
+ "execution": {
961
+ "iopub.execute_input": "2023-07-10T18:25:28.092991Z",
962
+ "iopub.status.busy": "2023-07-10T18:25:28.092179Z",
963
+ "iopub.status.idle": "2023-07-10T18:25:35.965867Z",
964
+ "shell.execute_reply": "2023-07-10T18:25:35.965309Z",
965
+ "shell.execute_reply.started": "2023-07-10T18:25:28.092953Z"
966
+ }
967
+ },
968
+ "outputs": [],
969
+ "source": [
970
+ "# from transformers import RobertaForMaskedLM\n",
971
+ "# model = RobertaForMaskedLM.from_pretrained(\"/notebooks/9wimu9/sinhala-bert-1/checkpoint-1625\")"
972
+ ]
973
+ },
974
+ {
975
+ "cell_type": "code",
976
+ "execution_count": 21,
977
+ "id": "969484c6-4035-4234-8ac7-209ab4a014bc",
978
+ "metadata": {
979
+ "execution": {
980
+ "iopub.execute_input": "2023-07-10T18:25:50.083080Z",
981
+ "iopub.status.busy": "2023-07-10T18:25:50.082571Z",
982
+ "iopub.status.idle": "2023-07-10T18:25:50.269795Z",
983
+ "shell.execute_reply": "2023-07-10T18:25:50.269277Z",
984
+ "shell.execute_reply.started": "2023-07-10T18:25:50.083058Z"
985
+ }
986
+ },
987
+ "outputs": [],
988
+ "source": [
989
+ "from transformers import DataCollatorForLanguageModeling,Trainer\n",
990
+ "\n",
991
+ "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)\n",
992
+ "\n",
993
+ "trainer = Trainer(\n",
994
+ " model=model,\n",
995
+ " args=training_args,\n",
996
+ " train_dataset=lm_datasets[\"train\"],\n",
997
+ " eval_dataset=lm_datasets[\"valid\"],\n",
998
+ " data_collator=data_collator,\n",
999
+ " optimizers=[optimizer, scheduler]\n",
1000
+ ")"
1001
+ ]
1002
+ },
1003
+ {
1004
+ "cell_type": "code",
1005
+ "execution_count": 14,
1006
+ "id": "4c2f4490-b3bc-4ec6-bef1-2bd71933369a",
1007
+ "metadata": {
1008
+ "execution": {
1009
+ "iopub.execute_input": "2023-07-10T15:10:13.622770Z",
1010
+ "iopub.status.busy": "2023-07-10T15:10:13.622142Z",
1011
+ "iopub.status.idle": "2023-07-10T15:10:13.625595Z",
1012
+ "shell.execute_reply": "2023-07-10T15:10:13.625073Z",
1013
+ "shell.execute_reply.started": "2023-07-10T15:10:13.622747Z"
1014
+ }
1015
+ },
1016
+ "outputs": [],
1017
+ "source": [
1018
+ "wandb.finish()\n",
1019
+ "# wandb.init()"
1020
+ ]
1021
+ },
1022
+ {
1023
+ "cell_type": "code",
1024
+ "execution_count": 15,
1025
+ "id": "17979cc2-2e66-4055-aabb-29d9ee90112d",
1026
+ "metadata": {
1027
+ "execution": {
1028
+ "iopub.execute_input": "2023-07-08T07:31:19.523715Z",
1029
+ "iopub.status.busy": "2023-07-08T07:31:19.523529Z",
1030
+ "iopub.status.idle": "2023-07-08T07:31:20.383711Z",
1031
+ "shell.execute_reply": "2023-07-08T07:31:20.382696Z",
1032
+ "shell.execute_reply.started": "2023-07-08T07:31:19.523696Z"
1033
+ }
1034
+ },
1035
+ "outputs": [],
1036
+ "source": [
1037
+ "# !rm -rf /notebooks/9wimu9/sinhala-bert-1"
1038
+ ]
1039
+ },
1040
+ {
1041
+ "cell_type": "code",
1042
+ "execution_count": null,
1043
+ "id": "b8bd0ab4-6412-4c0c-a215-a0c5cd5d4626",
1044
+ "metadata": {
1045
+ "execution": {
1046
+ "iopub.execute_input": "2023-07-10T15:10:17.837648Z",
1047
+ "iopub.status.busy": "2023-07-10T15:10:17.837138Z"
1048
+ }
1049
+ },
1050
+ "outputs": [
1051
+ {
1052
+ "data": {
1053
+ "text/html": [
1054
+ "Changes to your `wandb` environment variables will be ignored because your `wandb` session has already started. For more information on how to modify your settings with `wandb.init()` arguments, please refer to <a href='https://wandb.me/wandb-init' target=\"_blank\">the W&B docs</a>."
1055
+ ],
1056
+ "text/plain": [
1057
+ "<IPython.core.display.HTML object>"
1058
+ ]
1059
+ },
1060
+ "metadata": {},
1061
+ "output_type": "display_data"
1062
+ },
1063
+ {
1064
+ "name": "stderr",
1065
+ "output_type": "stream",
1066
+ "text": [
1067
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m9wimu9\u001b[0m (\u001b[33msinquad\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
1068
+ ]
1069
+ },
1070
+ {
1071
+ "data": {
1072
+ "text/html": [
1073
+ "Tracking run with wandb version 0.15.5"
1074
+ ],
1075
+ "text/plain": [
1076
+ "<IPython.core.display.HTML object>"
1077
+ ]
1078
+ },
1079
+ "metadata": {},
1080
+ "output_type": "display_data"
1081
+ },
1082
+ {
1083
+ "data": {
1084
+ "text/html": [
1085
+ "Run data is saved locally in <code>/notebooks/wandb/run-20230710_151033-wsjuqghz</code>"
1086
+ ],
1087
+ "text/plain": [
1088
+ "<IPython.core.display.HTML object>"
1089
+ ]
1090
+ },
1091
+ "metadata": {},
1092
+ "output_type": "display_data"
1093
+ },
1094
+ {
1095
+ "data": {
1096
+ "text/html": [
1097
+ "Syncing run <strong><a href='https://wandb.ai/sinquad/sinhala_bert_v1.2/runs/wsjuqghz' target=\"_blank\">classic-eon-6</a></strong> to <a href='https://wandb.ai/sinquad/sinhala_bert_v1.2' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
1098
+ ],
1099
+ "text/plain": [
1100
+ "<IPython.core.display.HTML object>"
1101
+ ]
1102
+ },
1103
+ "metadata": {},
1104
+ "output_type": "display_data"
1105
+ },
1106
+ {
1107
+ "data": {
1108
+ "text/html": [
1109
+ " View project at <a href='https://wandb.ai/sinquad/sinhala_bert_v1.2' target=\"_blank\">https://wandb.ai/sinquad/sinhala_bert_v1.2</a>"
1110
+ ],
1111
+ "text/plain": [
1112
+ "<IPython.core.display.HTML object>"
1113
+ ]
1114
+ },
1115
+ "metadata": {},
1116
+ "output_type": "display_data"
1117
+ },
1118
+ {
1119
+ "data": {
1120
+ "text/html": [
1121
+ " View run at <a href='https://wandb.ai/sinquad/sinhala_bert_v1.2/runs/wsjuqghz' target=\"_blank\">https://wandb.ai/sinquad/sinhala_bert_v1.2/runs/wsjuqghz</a>"
1122
+ ],
1123
+ "text/plain": [
1124
+ "<IPython.core.display.HTML object>"
1125
+ ]
1126
+ },
1127
+ "metadata": {},
1128
+ "output_type": "display_data"
1129
+ },
1130
+ {
1131
+ "name": "stderr",
1132
+ "output_type": "stream",
1133
+ "text": [
1134
+ "You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
1135
+ ]
1136
+ },
1137
+ {
1138
+ "data": {
1139
+ "text/html": [
1140
+ "\n",
1141
+ " <div>\n",
1142
+ " \n",
1143
+ " <progress value='1638' max='1827' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
1144
+ " [1638/1827 1:55:38 < 3:16:54, 0.02 it/s, Epoch 0.90/1]\n",
1145
+ " </div>\n",
1146
+ " <table border=\"1\" class=\"dataframe\">\n",
1147
+ " <thead>\n",
1148
+ " <tr style=\"text-align: left;\">\n",
1149
+ " <th>Epoch</th>\n",
1150
+ " <th>Training Loss</th>\n",
1151
+ " <th>Validation Loss</th>\n",
1152
+ " </tr>\n",
1153
+ " </thead>\n",
1154
+ " <tbody>\n",
1155
+ " </tbody>\n",
1156
+ "</table><p>"
1157
+ ],
1158
+ "text/plain": [
1159
+ "<IPython.core.display.HTML object>"
1160
+ ]
1161
+ },
1162
+ "metadata": {},
1163
+ "output_type": "display_data"
1164
+ }
1165
+ ],
1166
+ "source": [
1167
+ "trainer.train(resume_from_checkpoint = True)\n",
1168
+ "wandb.finish()"
1169
+ ]
1170
+ },
1171
+ {
1172
+ "cell_type": "code",
1173
+ "execution_count": 22,
1174
+ "id": "bbf22bea-7026-42c9-a643-ba65ab8cdbff",
1175
+ "metadata": {
1176
+ "execution": {
1177
+ "iopub.execute_input": "2023-07-10T18:26:14.038132Z",
1178
+ "iopub.status.busy": "2023-07-10T18:26:14.037456Z",
1179
+ "iopub.status.idle": "2023-07-10T18:57:49.712287Z",
1180
+ "shell.execute_reply": "2023-07-10T18:57:49.711640Z",
1181
+ "shell.execute_reply.started": "2023-07-10T18:26:14.038103Z"
1182
+ }
1183
+ },
1184
+ "outputs": [
1185
+ {
1186
+ "data": {
1187
+ "text/html": [
1188
+ "\n",
1189
+ " <div>\n",
1190
+ " \n",
1191
+ " <progress value='50731' max='50731' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
1192
+ " [50731/50731 31:35]\n",
1193
+ " </div>\n",
1194
+ " "
1195
+ ],
1196
+ "text/plain": [
1197
+ "<IPython.core.display.HTML object>"
1198
+ ]
1199
+ },
1200
+ "metadata": {},
1201
+ "output_type": "display_data"
1202
+ },
1203
+ {
1204
+ "name": "stdout",
1205
+ "output_type": "stream",
1206
+ "text": [
1207
+ "Perplexity: 78.33\n"
1208
+ ]
1209
+ }
1210
+ ],
1211
+ "source": [
1212
+ "eval_results = trainer.evaluate()\n",
1213
+ "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
1214
+ ]
1215
+ },
1216
+ {
1217
+ "cell_type": "code",
1218
+ "execution_count": 23,
1219
+ "id": "f04eaadd-a13d-4651-ad14-91bcc01f92e1",
1220
+ "metadata": {
1221
+ "execution": {
1222
+ "iopub.execute_input": "2023-07-10T18:58:07.077477Z",
1223
+ "iopub.status.busy": "2023-07-10T18:58:07.076979Z",
1224
+ "iopub.status.idle": "2023-07-10T18:58:07.082203Z",
1225
+ "shell.execute_reply": "2023-07-10T18:58:07.081426Z",
1226
+ "shell.execute_reply.started": "2023-07-10T18:58:07.077454Z"
1227
+ }
1228
+ },
1229
+ "outputs": [
1230
+ {
1231
+ "data": {
1232
+ "text/plain": [
1233
+ "{'eval_loss': 4.360935211181641,\n",
1234
+ " 'eval_runtime': 1895.6573,\n",
1235
+ " 'eval_samples_per_second': 214.09,\n",
1236
+ " 'eval_steps_per_second': 26.762}"
1237
+ ]
1238
+ },
1239
+ "execution_count": 23,
1240
+ "metadata": {},
1241
+ "output_type": "execute_result"
1242
+ }
1243
+ ],
1244
+ "source": [
1245
+ "eval_results"
1246
+ ]
1247
+ },
1248
+ {
1249
+ "cell_type": "code",
1250
+ "execution_count": 25,
1251
+ "id": "d3417a50-f0a7-4cd7-bc3b-14106660be58",
1252
+ "metadata": {
1253
+ "execution": {
1254
+ "iopub.execute_input": "2023-07-10T18:58:52.507374Z",
1255
+ "iopub.status.busy": "2023-07-10T18:58:52.506748Z",
1256
+ "iopub.status.idle": "2023-07-10T18:58:53.770508Z",
1257
+ "shell.execute_reply": "2023-07-10T18:58:53.769992Z",
1258
+ "shell.execute_reply.started": "2023-07-10T18:58:52.507341Z"
1259
+ }
1260
+ },
1261
+ "outputs": [],
1262
+ "source": [
1263
+ "trainer.save_model(\"path_to_save\")"
1264
+ ]
1265
+ },
1266
+ {
1267
+ "cell_type": "code",
1268
+ "execution_count": 26,
1269
+ "id": "6a3b42de-552c-41fc-a454-afe8a0bf567d",
1270
+ "metadata": {
1271
+ "execution": {
1272
+ "iopub.execute_input": "2023-07-10T18:59:46.871782Z",
1273
+ "iopub.status.busy": "2023-07-10T18:59:46.871272Z",
1274
+ "iopub.status.idle": "2023-07-10T18:59:49.794057Z",
1275
+ "shell.execute_reply": "2023-07-10T18:59:49.793583Z",
1276
+ "shell.execute_reply.started": "2023-07-10T18:59:46.871761Z"
1277
+ }
1278
+ },
1279
+ "outputs": [
1280
+ {
1281
+ "name": "stderr",
1282
+ "output_type": "stream",
1283
+ "text": [
1284
+ "Some weights of the model checkpoint at /notebooks/path_to_save were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']\n",
1285
+ "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
1286
+ "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
1287
+ "Some weights of RobertaModel were not initialized from the model checkpoint at /notebooks/path_to_save and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n",
1288
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
1289
+ ]
1290
+ }
1291
+ ],
1292
+ "source": [
1293
+ "from transformers import AutoModel \n",
1294
+ "model = AutoModel.from_pretrained('/notebooks/path_to_save',local_files_only=True)\n"
1295
+ ]
1296
+ },
1297
+ {
1298
+ "cell_type": "code",
1299
+ "execution_count": null,
1300
+ "id": "b6f2c49a-9a09-4949-b67f-29df6d0aa895",
1301
+ "metadata": {
1302
+ "execution": {
1303
+ "iopub.execute_input": "2023-07-10T19:01:49.192299Z",
1304
+ "iopub.status.busy": "2023-07-10T19:01:49.191549Z"
1305
+ }
1306
+ },
1307
+ "outputs": [
1308
+ {
1309
+ "data": {
1310
+ "application/vnd.jupyter.widget-view+json": {
1311
+ "model_id": "77a76086b50b43a2a0bb1cc869ef8e26",
1312
+ "version_major": 2,
1313
+ "version_minor": 0
1314
+ },
1315
+ "text/plain": [
1316
+ "pytorch_model.bin: 0%| | 0.00/1.27G [00:00<?, ?B/s]"
1317
+ ]
1318
+ },
1319
+ "metadata": {},
1320
+ "output_type": "display_data"
1321
+ }
1322
+ ],
1323
+ "source": [
1324
+ "model.push_to_hub('9wimu9/sinhala-bert-1.2')"
1325
+ ]
1326
+ },
1327
+ {
1328
+ "cell_type": "code",
1329
+ "execution_count": 29,
1330
+ "id": "d4553ec7-1e38-4b44-8c5f-e46786cd3cfc",
1331
+ "metadata": {
1332
+ "execution": {
1333
+ "iopub.execute_input": "2023-07-09T13:08:15.514124Z",
1334
+ "iopub.status.busy": "2023-07-09T13:08:15.513517Z",
1335
+ "iopub.status.idle": "2023-07-09T13:08:15.918801Z",
1336
+ "shell.execute_reply": "2023-07-09T13:08:15.918326Z",
1337
+ "shell.execute_reply.started": "2023-07-09T13:08:15.514097Z"
1338
+ }
1339
+ },
1340
+ "outputs": [],
1341
+ "source": [
1342
+ "from huggingface_hub import HfApi\n",
1343
+ "api = HfApi()\n",
1344
+ "files = ['tokenizer.json','training_args.bin','trainer.ipynb']\n",
1345
+ "for file in files:\n",
1346
+ " api.upload_file(\n",
1347
+ " path_or_fileobj=\"/notebooks/path_to_save/\"+file,\n",
1348
+ " path_in_repo=file,\n",
1349
+ " repo_id=\"9wimu9/sinhala-bert-1.1\",\n",
1350
+ " repo_type=\"model\",\n",
1351
+ " )"
1352
+ ]
1353
+ },
1354
+ {
1355
+ "cell_type": "code",
1356
+ "execution_count": null,
1357
+ "id": "d1614503-df5d-454f-a81d-d96bb1899443",
1358
+ "metadata": {},
1359
+ "outputs": [],
1360
+ "source": [
1361
+ "learning rate scheduler details can be found here\n",
1362
+ "https://dev.classmethod.jp/articles/huggingface-usage-scheluder-type/"
1363
+ ]
1364
+ },
1365
+ {
1366
+ "cell_type": "code",
1367
+ "execution_count": null,
1368
+ "id": "cd184295-1c0b-4625-a516-da417beb814f",
1369
+ "metadata": {},
1370
+ "outputs": [],
1371
+ "source": [
1372
+ "bert hyper params\n",
1373
+ "======================\n",
1374
+ "β1 = 0.9,\n",
1375
+ "β2 = 0.999, \n",
1376
+ "ε = 1e-6\n",
1377
+ "L2 weight decay = 0.01\n",
1378
+ "learning rate = warmed up over the first 10k steps to a peak of 1e-4, then linearly decayed\n",
1379
+ "dropout = 0.1\n",
1380
+ "batch size = 256\n",
1381
+ "training steps = 1M\n",
1382
+ "max_token_length = 512\n",
1383
+ "\n",
1384
+ "roberta\n",
1385
+ "============\n",
1386
+ "β2 = 0.98 for large batch sizes\n",
1387
+ "max_token_length = 512\n",
1388
+ "batch size = 2k\n",
1389
+ "lr = 7e-4\n",
1390
+ "\n"
1391
+ ]
1392
+ }
1393
+ ],
1394
+ "metadata": {
1395
+ "kernelspec": {
1396
+ "display_name": "Python 3 (ipykernel)",
1397
+ "language": "python",
1398
+ "name": "python3"
1399
+ },
1400
+ "language_info": {
1401
+ "codemirror_mode": {
1402
+ "name": "ipython",
1403
+ "version": 3
1404
+ },
1405
+ "file_extension": ".py",
1406
+ "mimetype": "text/x-python",
1407
+ "name": "python",
1408
+ "nbconvert_exporter": "python",
1409
+ "pygments_lexer": "ipython3",
1410
+ "version": "3.9.16"
1411
+ }
1412
+ },
1413
+ "nbformat": 4,
1414
+ "nbformat_minor": 5
1415
+ }