MohamedAhmedAE commited on
Commit
b65c726
·
verified ·
1 Parent(s): 69ecb27

Training in progress, step 24000

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7935750f22a5cb64576d7248a8a5f0224040736eb89a0226837bf045912e9e1
3
  size 360740440
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:687727c0c025af3bad6b25b52789d99245d33b1d1735aa8becfc6bfe71318f16
3
  size 360740440
last-checkpoint/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "k_proj",
27
- "o_proj",
28
- "v_proj",
29
  "gate_proj",
30
  "q_proj",
 
31
  "up_proj",
32
- "down_proj"
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
 
26
  "gate_proj",
27
  "q_proj",
28
+ "k_proj",
29
  "up_proj",
30
+ "v_proj",
31
+ "down_proj",
32
+ "o_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7935750f22a5cb64576d7248a8a5f0224040736eb89a0226837bf045912e9e1
3
  size 360740440
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a1dd12a210017a07f561dace36dc00f59b55ff12d579593e8e5f59db1ca495d
3
  size 360740440
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11682a89e15aced12757d287f46451125b9fd73c7e942a12f559b3969bab3c6d
3
  size 184018770
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09dbea6aaf941a049c9aa3656b362f19b6c30102fcdcd2d81680e63cd278a9c4
3
  size 184018770
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d0a6938b742d8fae79b2ab06f8af88c67eff4cb6e61d15d3762e2a19932fe0d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe9c2e6cee455bb212d5ae0dd7c343acca65b7f37490224e2c429dd428b2c9ad
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d154564cf31ffc62a3bcd7ae558bf3a5056323ae93bfc5a803eed05c0150871
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06eec3ac373cc6b81236635fbbf70132276a159a47d5685288ad3485f23d8131
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.016552480055304755,
5
  "eval_steps": 500,
6
- "global_step": 23800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -28,818 +28,6 @@
28
  "learning_rate": 1.9999998496226195e-05,
29
  "loss": 2.202,
30
  "step": 600
31
- },
32
- {
33
- "epoch": 0.0005563858842119246,
34
- "grad_norm": 7.003796577453613,
35
- "learning_rate": 1.999999732662442e-05,
36
- "loss": 2.0739,
37
- "step": 800
38
- },
39
- {
40
- "epoch": 0.0006954823552649058,
41
- "grad_norm": 4.851119041442871,
42
- "learning_rate": 1.99999958228508e-05,
43
- "loss": 2.0175,
44
- "step": 1000
45
- },
46
- {
47
- "epoch": 0.0008345788263178869,
48
- "grad_norm": 6.82249116897583,
49
- "learning_rate": 1.999999398490542e-05,
50
- "loss": 2.0206,
51
- "step": 1200
52
- },
53
- {
54
- "epoch": 0.0009736752973708681,
55
- "grad_norm": 3.424342632293701,
56
- "learning_rate": 1.999999181278835e-05,
57
- "loss": 2.0587,
58
- "step": 1400
59
- },
60
- {
61
- "epoch": 0.001112771768423849,
62
- "grad_norm": 5.291999340057373,
63
- "learning_rate": 1.999998930649971e-05,
64
- "loss": 2.0822,
65
- "step": 1600
66
- },
67
- {
68
- "epoch": 0.0012518682394768305,
69
- "grad_norm": 5.283050537109375,
70
- "learning_rate": 1.9999986466039608e-05,
71
- "loss": 2.0833,
72
- "step": 1800
73
- },
74
- {
75
- "epoch": 0.0013909647105298116,
76
- "grad_norm": 5.604334354400635,
77
- "learning_rate": 1.999998329140819e-05,
78
- "loss": 2.1159,
79
- "step": 2000
80
- },
81
- {
82
- "epoch": 0.0015300611815827927,
83
- "grad_norm": 4.539398670196533,
84
- "learning_rate": 1.99999797826056e-05,
85
- "loss": 2.0815,
86
- "step": 2200
87
- },
88
- {
89
- "epoch": 0.0016691576526357739,
90
- "grad_norm": 5.886259078979492,
91
- "learning_rate": 1.9999975939632005e-05,
92
- "loss": 2.0135,
93
- "step": 2400
94
- },
95
- {
96
- "epoch": 0.001808254123688755,
97
- "grad_norm": 5.076303482055664,
98
- "learning_rate": 1.9999971762487593e-05,
99
- "loss": 2.0704,
100
- "step": 2600
101
- },
102
- {
103
- "epoch": 0.0019473505947417361,
104
- "grad_norm": 5.898621559143066,
105
- "learning_rate": 1.999996725117256e-05,
106
- "loss": 2.0405,
107
- "step": 2800
108
- },
109
- {
110
- "epoch": 0.0020864470657947173,
111
- "grad_norm": 4.124399662017822,
112
- "learning_rate": 1.9999962405687122e-05,
113
- "loss": 1.9888,
114
- "step": 3000
115
- },
116
- {
117
- "epoch": 0.002225543536847698,
118
- "grad_norm": 5.221329689025879,
119
- "learning_rate": 1.9999957226031515e-05,
120
- "loss": 2.0433,
121
- "step": 3200
122
- },
123
- {
124
- "epoch": 0.0023646400079006796,
125
- "grad_norm": 5.066499710083008,
126
- "learning_rate": 1.9999951712205977e-05,
127
- "loss": 1.9347,
128
- "step": 3400
129
- },
130
- {
131
- "epoch": 0.002503736478953661,
132
- "grad_norm": 5.852139949798584,
133
- "learning_rate": 1.9999945864210777e-05,
134
- "loss": 2.0016,
135
- "step": 3600
136
- },
137
- {
138
- "epoch": 0.002642832950006642,
139
- "grad_norm": 3.3446638584136963,
140
- "learning_rate": 1.999993968204619e-05,
141
- "loss": 2.0887,
142
- "step": 3800
143
- },
144
- {
145
- "epoch": 0.002781929421059623,
146
- "grad_norm": 4.233782768249512,
147
- "learning_rate": 1.9999933165712525e-05,
148
- "loss": 2.0563,
149
- "step": 4000
150
- },
151
- {
152
- "epoch": 0.002921025892112604,
153
- "grad_norm": 5.62469482421875,
154
- "learning_rate": 1.999992631521008e-05,
155
- "loss": 1.9745,
156
- "step": 4200
157
- },
158
- {
159
- "epoch": 0.0030601223631655855,
160
- "grad_norm": 5.743213176727295,
161
- "learning_rate": 1.999991913053918e-05,
162
- "loss": 1.9969,
163
- "step": 4400
164
- },
165
- {
166
- "epoch": 0.0031992188342185664,
167
- "grad_norm": 6.02440071105957,
168
- "learning_rate": 1.9999911611700172e-05,
169
- "loss": 1.9583,
170
- "step": 4600
171
- },
172
- {
173
- "epoch": 0.0033383153052715477,
174
- "grad_norm": 6.839730739593506,
175
- "learning_rate": 1.999990375869342e-05,
176
- "loss": 2.0131,
177
- "step": 4800
178
- },
179
- {
180
- "epoch": 0.0034774117763245287,
181
- "grad_norm": 6.269118309020996,
182
- "learning_rate": 1.9999895571519294e-05,
183
- "loss": 2.0302,
184
- "step": 5000
185
- },
186
- {
187
- "epoch": 0.00361650824737751,
188
- "grad_norm": 5.001865863800049,
189
- "learning_rate": 1.9999887050178187e-05,
190
- "loss": 1.946,
191
- "step": 5200
192
- },
193
- {
194
- "epoch": 0.003755604718430491,
195
- "grad_norm": 4.900404930114746,
196
- "learning_rate": 1.99998781946705e-05,
197
- "loss": 1.9367,
198
- "step": 5400
199
- },
200
- {
201
- "epoch": 0.0038947011894834723,
202
- "grad_norm": 15.09891128540039,
203
- "learning_rate": 1.9999869004996664e-05,
204
- "loss": 2.0014,
205
- "step": 5600
206
- },
207
- {
208
- "epoch": 0.004033797660536453,
209
- "grad_norm": 5.8560285568237305,
210
- "learning_rate": 1.9999859481157112e-05,
211
- "loss": 1.9926,
212
- "step": 5800
213
- },
214
- {
215
- "epoch": 0.004172894131589435,
216
- "grad_norm": 4.16021203994751,
217
- "learning_rate": 1.99998496231523e-05,
218
- "loss": 1.9833,
219
- "step": 6000
220
- },
221
- {
222
- "epoch": 0.004311990602642416,
223
- "grad_norm": 5.219597339630127,
224
- "learning_rate": 1.9999839430982698e-05,
225
- "loss": 2.0193,
226
- "step": 6200
227
- },
228
- {
229
- "epoch": 0.004451087073695396,
230
- "grad_norm": 5.076187610626221,
231
- "learning_rate": 1.9999828904648794e-05,
232
- "loss": 1.9929,
233
- "step": 6400
234
- },
235
- {
236
- "epoch": 0.004590183544748378,
237
- "grad_norm": 4.857876777648926,
238
- "learning_rate": 1.9999818044151088e-05,
239
- "loss": 2.0109,
240
- "step": 6600
241
- },
242
- {
243
- "epoch": 0.004729280015801359,
244
- "grad_norm": 3.6183769702911377,
245
- "learning_rate": 1.9999806849490103e-05,
246
- "loss": 2.019,
247
- "step": 6800
248
- },
249
- {
250
- "epoch": 0.0048683764868543405,
251
- "grad_norm": 3.081662178039551,
252
- "learning_rate": 1.9999795320666373e-05,
253
- "loss": 2.0102,
254
- "step": 7000
255
- },
256
- {
257
- "epoch": 0.005007472957907322,
258
- "grad_norm": 3.6525964736938477,
259
- "learning_rate": 1.9999783457680448e-05,
260
- "loss": 2.0319,
261
- "step": 7200
262
- },
263
- {
264
- "epoch": 0.005146569428960302,
265
- "grad_norm": 5.330043315887451,
266
- "learning_rate": 1.9999771260532886e-05,
267
- "loss": 1.9925,
268
- "step": 7400
269
- },
270
- {
271
- "epoch": 0.005285665900013284,
272
- "grad_norm": 4.449774742126465,
273
- "learning_rate": 1.9999758729224277e-05,
274
- "loss": 2.033,
275
- "step": 7600
276
- },
277
- {
278
- "epoch": 0.005424762371066265,
279
- "grad_norm": 4.431877136230469,
280
- "learning_rate": 1.9999745863755225e-05,
281
- "loss": 2.0076,
282
- "step": 7800
283
- },
284
- {
285
- "epoch": 0.005563858842119246,
286
- "grad_norm": 4.620669841766357,
287
- "learning_rate": 1.999973266412633e-05,
288
- "loss": 1.9021,
289
- "step": 8000
290
- },
291
- {
292
- "epoch": 0.005702955313172227,
293
- "grad_norm": 6.169932842254639,
294
- "learning_rate": 1.9999719130338235e-05,
295
- "loss": 1.9601,
296
- "step": 8200
297
- },
298
- {
299
- "epoch": 0.005842051784225208,
300
- "grad_norm": 5.546196460723877,
301
- "learning_rate": 1.9999705262391574e-05,
302
- "loss": 2.0071,
303
- "step": 8400
304
- },
305
- {
306
- "epoch": 0.00598114825527819,
307
- "grad_norm": 6.319796562194824,
308
- "learning_rate": 1.999969106028702e-05,
309
- "loss": 1.9881,
310
- "step": 8600
311
- },
312
- {
313
- "epoch": 0.006120244726331171,
314
- "grad_norm": 5.991957187652588,
315
- "learning_rate": 1.999967652402525e-05,
316
- "loss": 1.9871,
317
- "step": 8800
318
- },
319
- {
320
- "epoch": 0.006259341197384151,
321
- "grad_norm": 8.088932037353516,
322
- "learning_rate": 1.9999661653606947e-05,
323
- "loss": 2.0261,
324
- "step": 9000
325
- },
326
- {
327
- "epoch": 0.006398437668437133,
328
- "grad_norm": 4.159659385681152,
329
- "learning_rate": 1.9999646449032833e-05,
330
- "loss": 2.0084,
331
- "step": 9200
332
- },
333
- {
334
- "epoch": 0.006537534139490114,
335
- "grad_norm": 4.520460605621338,
336
- "learning_rate": 1.9999630910303628e-05,
337
- "loss": 1.9969,
338
- "step": 9400
339
- },
340
- {
341
- "epoch": 0.0066766306105430955,
342
- "grad_norm": 5.970917701721191,
343
- "learning_rate": 1.9999615037420078e-05,
344
- "loss": 2.0213,
345
- "step": 9600
346
- },
347
- {
348
- "epoch": 0.006815727081596076,
349
- "grad_norm": 4.681369304656982,
350
- "learning_rate": 1.9999598830382934e-05,
351
- "loss": 1.9711,
352
- "step": 9800
353
- },
354
- {
355
- "epoch": 0.006954823552649057,
356
- "grad_norm": 4.877756118774414,
357
- "learning_rate": 1.9999582289192974e-05,
358
- "loss": 1.9427,
359
- "step": 10000
360
- },
361
- {
362
- "epoch": 0.007093920023702039,
363
- "grad_norm": 6.686002254486084,
364
- "learning_rate": 1.9999565413850993e-05,
365
- "loss": 2.0468,
366
- "step": 10200
367
- },
368
- {
369
- "epoch": 0.00723301649475502,
370
- "grad_norm": 4.4461236000061035,
371
- "learning_rate": 1.9999548204357783e-05,
372
- "loss": 1.9352,
373
- "step": 10400
374
- },
375
- {
376
- "epoch": 0.007372112965808001,
377
- "grad_norm": 9.74795150756836,
378
- "learning_rate": 1.9999530660714175e-05,
379
- "loss": 1.9799,
380
- "step": 10600
381
- },
382
- {
383
- "epoch": 0.007511209436860982,
384
- "grad_norm": 7.161931037902832,
385
- "learning_rate": 1.999951278292101e-05,
386
- "loss": 2.012,
387
- "step": 10800
388
- },
389
- {
390
- "epoch": 0.007650305907913963,
391
- "grad_norm": 3.948525905609131,
392
- "learning_rate": 1.999949457097913e-05,
393
- "loss": 1.9932,
394
- "step": 11000
395
- },
396
- {
397
- "epoch": 0.007789402378966945,
398
- "grad_norm": 4.876201629638672,
399
- "learning_rate": 1.9999476024889414e-05,
400
- "loss": 1.9907,
401
- "step": 11200
402
- },
403
- {
404
- "epoch": 0.007928498850019925,
405
- "grad_norm": 6.009729862213135,
406
- "learning_rate": 1.9999457144652746e-05,
407
- "loss": 1.9347,
408
- "step": 11400
409
- },
410
- {
411
- "epoch": 0.008067595321072906,
412
- "grad_norm": 5.02329158782959,
413
- "learning_rate": 1.999943793027002e-05,
414
- "loss": 1.9714,
415
- "step": 11600
416
- },
417
- {
418
- "epoch": 0.008206691792125888,
419
- "grad_norm": 5.213016986846924,
420
- "learning_rate": 1.999941838174216e-05,
421
- "loss": 2.0812,
422
- "step": 11800
423
- },
424
- {
425
- "epoch": 0.00834578826317887,
426
- "grad_norm": 6.0479416847229,
427
- "learning_rate": 1.9999398499070103e-05,
428
- "loss": 1.9836,
429
- "step": 12000
430
- },
431
- {
432
- "epoch": 0.00848488473423185,
433
- "grad_norm": 5.763875484466553,
434
- "learning_rate": 1.9999378282254787e-05,
435
- "loss": 2.0187,
436
- "step": 12200
437
- },
438
- {
439
- "epoch": 0.008623981205284832,
440
- "grad_norm": 5.157899379730225,
441
- "learning_rate": 1.999935773129719e-05,
442
- "loss": 1.9386,
443
- "step": 12400
444
- },
445
- {
446
- "epoch": 0.008763077676337813,
447
- "grad_norm": 2.9542014598846436,
448
- "learning_rate": 1.999933684619828e-05,
449
- "loss": 2.0046,
450
- "step": 12600
451
- },
452
- {
453
- "epoch": 0.008902174147390793,
454
- "grad_norm": 5.380853176116943,
455
- "learning_rate": 1.9999315626959067e-05,
456
- "loss": 2.0102,
457
- "step": 12800
458
- },
459
- {
460
- "epoch": 0.009041270618443774,
461
- "grad_norm": 5.02028226852417,
462
- "learning_rate": 1.999929407358055e-05,
463
- "loss": 1.9841,
464
- "step": 13000
465
- },
466
- {
467
- "epoch": 0.009180367089496756,
468
- "grad_norm": 6.345952987670898,
469
- "learning_rate": 1.9999272186063767e-05,
470
- "loss": 1.9903,
471
- "step": 13200
472
- },
473
- {
474
- "epoch": 0.009319463560549737,
475
- "grad_norm": 5.001527309417725,
476
- "learning_rate": 1.9999249964409763e-05,
477
- "loss": 1.9106,
478
- "step": 13400
479
- },
480
- {
481
- "epoch": 0.009458560031602718,
482
- "grad_norm": 5.8275651931762695,
483
- "learning_rate": 1.9999227408619597e-05,
484
- "loss": 1.946,
485
- "step": 13600
486
- },
487
- {
488
- "epoch": 0.0095976565026557,
489
- "grad_norm": 4.7476396560668945,
490
- "learning_rate": 1.9999204518694348e-05,
491
- "loss": 1.99,
492
- "step": 13800
493
- },
494
- {
495
- "epoch": 0.009736752973708681,
496
- "grad_norm": 4.389928817749023,
497
- "learning_rate": 1.9999181294635103e-05,
498
- "loss": 1.9211,
499
- "step": 14000
500
- },
501
- {
502
- "epoch": 0.009875849444761662,
503
- "grad_norm": 3.244771957397461,
504
- "learning_rate": 1.9999157736442973e-05,
505
- "loss": 1.9621,
506
- "step": 14200
507
- },
508
- {
509
- "epoch": 0.010014945915814644,
510
- "grad_norm": 5.625466823577881,
511
- "learning_rate": 1.999913384411909e-05,
512
- "loss": 1.9118,
513
- "step": 14400
514
- },
515
- {
516
- "epoch": 0.010154042386867623,
517
- "grad_norm": 4.992525577545166,
518
- "learning_rate": 1.9999109617664585e-05,
519
- "loss": 1.9967,
520
- "step": 14600
521
- },
522
- {
523
- "epoch": 0.010293138857920605,
524
- "grad_norm": 4.458539962768555,
525
- "learning_rate": 1.9999085057080614e-05,
526
- "loss": 1.9181,
527
- "step": 14800
528
- },
529
- {
530
- "epoch": 0.010432235328973586,
531
- "grad_norm": 4.348485946655273,
532
- "learning_rate": 1.999906016236836e-05,
533
- "loss": 1.954,
534
- "step": 15000
535
- },
536
- {
537
- "epoch": 0.010571331800026567,
538
- "grad_norm": 6.683957099914551,
539
- "learning_rate": 1.9999034933529003e-05,
540
- "loss": 1.957,
541
- "step": 15200
542
- },
543
- {
544
- "epoch": 0.010710428271079549,
545
- "grad_norm": 4.755821704864502,
546
- "learning_rate": 1.9999009370563746e-05,
547
- "loss": 1.9285,
548
- "step": 15400
549
- },
550
- {
551
- "epoch": 0.01084952474213253,
552
- "grad_norm": 4.806936264038086,
553
- "learning_rate": 1.9998983473473816e-05,
554
- "loss": 1.9129,
555
- "step": 15600
556
- },
557
- {
558
- "epoch": 0.010988621213185511,
559
- "grad_norm": 2.8975718021392822,
560
- "learning_rate": 1.9998957242260447e-05,
561
- "loss": 1.9598,
562
- "step": 15800
563
- },
564
- {
565
- "epoch": 0.011127717684238493,
566
- "grad_norm": 4.4350409507751465,
567
- "learning_rate": 1.9998930676924887e-05,
568
- "loss": 1.9818,
569
- "step": 16000
570
- },
571
- {
572
- "epoch": 0.011266814155291472,
573
- "grad_norm": 4.06223726272583,
574
- "learning_rate": 1.999890377746841e-05,
575
- "loss": 2.0254,
576
- "step": 16200
577
- },
578
- {
579
- "epoch": 0.011405910626344454,
580
- "grad_norm": 4.433406829833984,
581
- "learning_rate": 1.9998876543892297e-05,
582
- "loss": 1.9962,
583
- "step": 16400
584
- },
585
- {
586
- "epoch": 0.011545007097397435,
587
- "grad_norm": 4.087177753448486,
588
- "learning_rate": 1.9998848976197845e-05,
589
- "loss": 1.9585,
590
- "step": 16600
591
- },
592
- {
593
- "epoch": 0.011684103568450416,
594
- "grad_norm": 5.861230850219727,
595
- "learning_rate": 1.9998821074386376e-05,
596
- "loss": 1.9837,
597
- "step": 16800
598
- },
599
- {
600
- "epoch": 0.011823200039503398,
601
- "grad_norm": 7.665340423583984,
602
- "learning_rate": 1.9998792838459226e-05,
603
- "loss": 2.0083,
604
- "step": 17000
605
- },
606
- {
607
- "epoch": 0.01196229651055638,
608
- "grad_norm": 3.315056800842285,
609
- "learning_rate": 1.9998764268417728e-05,
610
- "loss": 2.0438,
611
- "step": 17200
612
- },
613
- {
614
- "epoch": 0.01210139298160936,
615
- "grad_norm": 4.6072258949279785,
616
- "learning_rate": 1.9998735364263257e-05,
617
- "loss": 1.9738,
618
- "step": 17400
619
- },
620
- {
621
- "epoch": 0.012240489452662342,
622
- "grad_norm": 4.772326946258545,
623
- "learning_rate": 1.9998706125997193e-05,
624
- "loss": 1.9731,
625
- "step": 17600
626
- },
627
- {
628
- "epoch": 0.012379585923715323,
629
- "grad_norm": 6.146634578704834,
630
- "learning_rate": 1.9998676553620927e-05,
631
- "loss": 1.9572,
632
- "step": 17800
633
- },
634
- {
635
- "epoch": 0.012518682394768303,
636
- "grad_norm": 5.802914619445801,
637
- "learning_rate": 1.9998646647135877e-05,
638
- "loss": 2.0147,
639
- "step": 18000
640
- },
641
- {
642
- "epoch": 0.012657778865821284,
643
- "grad_norm": 5.375220775604248,
644
- "learning_rate": 1.9998616406543457e-05,
645
- "loss": 1.9244,
646
- "step": 18200
647
- },
648
- {
649
- "epoch": 0.012796875336874266,
650
- "grad_norm": 5.7547101974487305,
651
- "learning_rate": 1.999858583184513e-05,
652
- "loss": 2.0012,
653
- "step": 18400
654
- },
655
- {
656
- "epoch": 0.012935971807927247,
657
- "grad_norm": 5.727509498596191,
658
- "learning_rate": 1.9998554923042343e-05,
659
- "loss": 2.0101,
660
- "step": 18600
661
- },
662
- {
663
- "epoch": 0.013075068278980228,
664
- "grad_norm": 7.512138366699219,
665
- "learning_rate": 1.9998523680136575e-05,
666
- "loss": 1.9783,
667
- "step": 18800
668
- },
669
- {
670
- "epoch": 0.01321416475003321,
671
- "grad_norm": 5.997193813323975,
672
- "learning_rate": 1.9998492103129314e-05,
673
- "loss": 1.9289,
674
- "step": 19000
675
- },
676
- {
677
- "epoch": 0.013353261221086191,
678
- "grad_norm": 3.422043800354004,
679
- "learning_rate": 1.9998460192022073e-05,
680
- "loss": 2.039,
681
- "step": 19200
682
- },
683
- {
684
- "epoch": 0.013492357692139172,
685
- "grad_norm": 4.149094581604004,
686
- "learning_rate": 1.999842794681637e-05,
687
- "loss": 1.9124,
688
- "step": 19400
689
- },
690
- {
691
- "epoch": 0.013631454163192152,
692
- "grad_norm": 4.264293670654297,
693
- "learning_rate": 1.9998395367513753e-05,
694
- "loss": 2.0029,
695
- "step": 19600
696
- },
697
- {
698
- "epoch": 0.013770550634245133,
699
- "grad_norm": 6.137360095977783,
700
- "learning_rate": 1.9998362454115767e-05,
701
- "loss": 1.9444,
702
- "step": 19800
703
- },
704
- {
705
- "epoch": 0.013909647105298115,
706
- "grad_norm": 6.3221588134765625,
707
- "learning_rate": 1.999832920662399e-05,
708
- "loss": 2.0269,
709
- "step": 20000
710
- },
711
- {
712
- "epoch": 0.014048743576351096,
713
- "grad_norm": 6.676060676574707,
714
- "learning_rate": 1.9998295625040006e-05,
715
- "loss": 1.9565,
716
- "step": 20200
717
- },
718
- {
719
- "epoch": 0.014187840047404077,
720
- "grad_norm": 7.500442981719971,
721
- "learning_rate": 1.9998261709365422e-05,
722
- "loss": 1.9674,
723
- "step": 20400
724
- },
725
- {
726
- "epoch": 0.014326936518457059,
727
- "grad_norm": 4.544761657714844,
728
- "learning_rate": 1.9998227459601847e-05,
729
- "loss": 2.0291,
730
- "step": 20600
731
- },
732
- {
733
- "epoch": 0.01446603298951004,
734
- "grad_norm": 4.8733744621276855,
735
- "learning_rate": 1.9998192875750928e-05,
736
- "loss": 1.9198,
737
- "step": 20800
738
- },
739
- {
740
- "epoch": 0.014605129460563021,
741
- "grad_norm": 4.430062294006348,
742
- "learning_rate": 1.999815795781431e-05,
743
- "loss": 1.9795,
744
- "step": 21000
745
- },
746
- {
747
- "epoch": 0.014744225931616003,
748
- "grad_norm": 4.341994762420654,
749
- "learning_rate": 1.9998122705793667e-05,
750
- "loss": 1.9514,
751
- "step": 21200
752
- },
753
- {
754
- "epoch": 0.014883322402668982,
755
- "grad_norm": 5.507206916809082,
756
- "learning_rate": 1.9998087119690667e-05,
757
- "loss": 2.0101,
758
- "step": 21400
759
- },
760
- {
761
- "epoch": 0.015022418873721964,
762
- "grad_norm": 3.7597243785858154,
763
- "learning_rate": 1.9998051199507023e-05,
764
- "loss": 1.9755,
765
- "step": 21600
766
- },
767
- {
768
- "epoch": 0.015161515344774945,
769
- "grad_norm": 7.106645107269287,
770
- "learning_rate": 1.9998014945244445e-05,
771
- "loss": 2.0197,
772
- "step": 21800
773
- },
774
- {
775
- "epoch": 0.015300611815827926,
776
- "grad_norm": 7.3320231437683105,
777
- "learning_rate": 1.9997978356904658e-05,
778
- "loss": 1.9894,
779
- "step": 22000
780
- },
781
- {
782
- "epoch": 0.015439708286880908,
783
- "grad_norm": 5.327560901641846,
784
- "learning_rate": 1.999794143448942e-05,
785
- "loss": 1.9868,
786
- "step": 22200
787
- },
788
- {
789
- "epoch": 0.01557880475793389,
790
- "grad_norm": 6.348668098449707,
791
- "learning_rate": 1.9997904178000485e-05,
792
- "loss": 2.0534,
793
- "step": 22400
794
- },
795
- {
796
- "epoch": 0.01571790122898687,
797
- "grad_norm": 5.529504299163818,
798
- "learning_rate": 1.9997866587439633e-05,
799
- "loss": 1.9518,
800
- "step": 22600
801
- },
802
- {
803
- "epoch": 0.01585699770003985,
804
- "grad_norm": 5.461117267608643,
805
- "learning_rate": 1.9997828662808665e-05,
806
- "loss": 1.9358,
807
- "step": 22800
808
- },
809
- {
810
- "epoch": 0.015996094171092833,
811
- "grad_norm": 4.327660083770752,
812
- "learning_rate": 1.999779040410938e-05,
813
- "loss": 1.9166,
814
- "step": 23000
815
- },
816
- {
817
- "epoch": 0.016135190642145813,
818
- "grad_norm": 4.492312908172607,
819
- "learning_rate": 1.9997751811343617e-05,
820
- "loss": 2.0067,
821
- "step": 23200
822
- },
823
- {
824
- "epoch": 0.016274287113198796,
825
- "grad_norm": 4.487472057342529,
826
- "learning_rate": 1.9997712884513206e-05,
827
- "loss": 1.9651,
828
- "step": 23400
829
- },
830
- {
831
- "epoch": 0.016413383584251776,
832
- "grad_norm": 4.890496253967285,
833
- "learning_rate": 1.9997673623620018e-05,
834
- "loss": 1.952,
835
- "step": 23600
836
- },
837
- {
838
- "epoch": 0.016552480055304755,
839
- "grad_norm": 3.2920827865600586,
840
- "learning_rate": 1.9997634028665915e-05,
841
- "loss": 2.0047,
842
- "step": 23800
843
  }
844
  ],
845
  "logging_steps": 200,
@@ -859,7 +47,7 @@
859
  "attributes": {}
860
  }
861
  },
862
- "total_flos": 4.313637571029811e+16,
863
  "train_batch_size": 1,
864
  "trial_name": null,
865
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.00041728941315894347,
5
  "eval_steps": 500,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
28
  "learning_rate": 1.9999998496226195e-05,
29
  "loss": 2.202,
30
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
32
  ],
33
  "logging_steps": 200,
 
47
  "attributes": {}
48
  }
49
  },
50
+ "total_flos": 1138117833400320.0,
51
  "train_batch_size": 1,
52
  "trial_name": null,
53
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c312ec173d6047febee792b17559c8f2b35849f5524db67908351bf3c183864c
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f73bef2970b56da564b1d8c87d27fe806335e746653d451535ecd6b817d641ba
3
  size 6840