mosama committed
Commit a096e9e · verified · 1 Parent(s): 3c6c388

Training in progress, step 1800, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8ac0520099b00fa8abb36aeca314ee7d832cf4b61bf801a2c30516aa851fb82b
+oid sha256:c19a8c4ff78657e1ad6849d03827d74a166332e2c92b5ee4c34966f79e091caa
 size 1370666272
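
Only the Git LFS pointer changes here: the `oid` is the SHA-256 of the new adapter weights, and `size` stays fixed because the tensor shapes are identical between checkpoints. A minimal sketch (not part of this repo; path, oid, and size copied from the new pointer above) for checking a pulled file against its pointer:

```python
import hashlib
from pathlib import Path

def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    """Return True if the pulled file matches the LFS pointer's sha256 oid and size."""
    p = Path(path)
    if p.stat().st_size != expected_size:
        return False
    digest = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# oid and size taken from the new adapter_model.safetensors pointer above.
print(verify_lfs_object(
    "last-checkpoint/adapter_model.safetensors",
    "c19a8c4ff78657e1ad6849d03827d74a166332e2c92b5ee4c34966f79e091caa",
    1370666272,
))
```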
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ae58219350ead23fe4928d07a04438a2ecd5b7d63f45709bb1f9132f8f69dece
+oid sha256:10c0470d53e83293b301fcaf8b6ed1125194ec8f54fe9618703c1367bf9a41e7
 size 697294462
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e3c94e5a55e06b7a613c8f5f874916b5625507d7637367c63c55ddfb15995bfd
+oid sha256:d290a2c62404485bacce37c57039bbf078af94d6cf0884b19d2a678f11aec096
 size 1064
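
Alongside the adapter weights, optimizer.pt and scheduler.pt carry the optimizer and LR-scheduler state that lets training resume from step 1800 rather than restart. A minimal sketch for peeking at those states locally (assumes a clone with LFS objects pulled; the exact dict layout depends on the optimizer/scheduler classes used, which this commit does not name):

```python
import torch

# Both files are plain torch-serialized state dicts saved by the Trainer.
opt_state = torch.load("last-checkpoint/optimizer.pt", map_location="cpu")
sched_state = torch.load("last-checkpoint/scheduler.pt", map_location="cpu")

print(list(opt_state.keys()))  # typically ['state', 'param_groups']
print(sched_state)             # small dict, consistent with the 1064-byte scheduler.pt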
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.4061738424045491,
+  "epoch": 0.4177788093303934,
   "eval_steps": 500,
-  "global_step": 1750,
+  "global_step": 1800,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -12257,6 +12257,356 @@
       "learning_rate": 0.0001967750823269455,
       "loss": 0.8951,
       "step": 1750
+    },
+    {
+      "epoch": 0.40640594174306605,
+      "grad_norm": 0.6410776972770691,
+      "learning_rate": 0.00019677140721822734,
+      "loss": 0.8931,
+      "step": 1751
+    },
+    {
+      "epoch": 0.4066380410815829,
+      "grad_norm": 0.7138279676437378,
+      "learning_rate": 0.00019676773005098766,
+      "loss": 0.9195,
+      "step": 1752
+    },
+    {
+      "epoch": 0.4068701404200998,
+      "grad_norm": 0.5348248481750488,
+      "learning_rate": 0.00019676405082530476,
+      "loss": 0.9232,
+      "step": 1753
+    },
+    {
+      "epoch": 0.4071022397586167,
+      "grad_norm": 0.6424762010574341,
+      "learning_rate": 0.00019676036954125684,
+      "loss": 0.9068,
+      "step": 1754
+    },
+    {
+      "epoch": 0.40733433909713357,
+      "grad_norm": 0.6323909759521484,
+      "learning_rate": 0.00019675668619892228,
+      "loss": 0.929,
+      "step": 1755
+    },
+    {
+      "epoch": 0.40756643843565044,
+      "grad_norm": 0.541854977607727,
+      "learning_rate": 0.00019675300079837935,
+      "loss": 0.9729,
+      "step": 1756
+    },
+    {
+      "epoch": 0.40779853777416736,
+      "grad_norm": 0.7092952132225037,
+      "learning_rate": 0.00019674931333970647,
+      "loss": 0.9005,
+      "step": 1757
+    },
+    {
+      "epoch": 0.4080306371126842,
+      "grad_norm": 0.5437161922454834,
+      "learning_rate": 0.0001967456238229821,
+      "loss": 0.9944,
+      "step": 1758
+    },
+    {
+      "epoch": 0.4082627364512011,
+      "grad_norm": 0.5253750681877136,
+      "learning_rate": 0.00019674193224828473,
+      "loss": 0.9535,
+      "step": 1759
+    },
+    {
+      "epoch": 0.408494835789718,
+      "grad_norm": 0.6329406499862671,
+      "learning_rate": 0.00019673823861569286,
+      "loss": 0.8904,
+      "step": 1760
+    },
+    {
+      "epoch": 0.4087269351282349,
+      "grad_norm": 0.5530345439910889,
+      "learning_rate": 0.00019673454292528508,
+      "loss": 0.934,
+      "step": 1761
+    },
+    {
+      "epoch": 0.40895903446675175,
+      "grad_norm": 0.6421835422515869,
+      "learning_rate": 0.00019673084517714,
+      "loss": 0.9014,
+      "step": 1762
+    },
+    {
+      "epoch": 0.40919113380526867,
+      "grad_norm": 0.5271580219268799,
+      "learning_rate": 0.00019672714537133628,
+      "loss": 0.923,
+      "step": 1763
+    },
+    {
+      "epoch": 0.40942323314378554,
+      "grad_norm": 0.5356336236000061,
+      "learning_rate": 0.00019672344350795258,
+      "loss": 0.9246,
+      "step": 1764
+    },
+    {
+      "epoch": 0.4096553324823024,
+      "grad_norm": 0.6168617606163025,
+      "learning_rate": 0.0001967197395870677,
+      "loss": 0.923,
+      "step": 1765
+    },
+    {
+      "epoch": 0.4098874318208193,
+      "grad_norm": 0.49557581543922424,
+      "learning_rate": 0.00019671603360876043,
+      "loss": 0.9448,
+      "step": 1766
+    },
+    {
+      "epoch": 0.4101195311593362,
+      "grad_norm": 0.5493084192276001,
+      "learning_rate": 0.00019671232557310958,
+      "loss": 0.9362,
+      "step": 1767
+    },
+    {
+      "epoch": 0.41035163049785306,
+      "grad_norm": 0.6057862639427185,
+      "learning_rate": 0.00019670861548019405,
+      "loss": 0.9443,
+      "step": 1768
+    },
+    {
+      "epoch": 0.41058372983637,
+      "grad_norm": 0.5347152948379517,
+      "learning_rate": 0.0001967049033300927,
+      "loss": 0.9054,
+      "step": 1769
+    },
+    {
+      "epoch": 0.41081582917488685,
+      "grad_norm": 0.5570089817047119,
+      "learning_rate": 0.0001967011891228846,
+      "loss": 0.9094,
+      "step": 1770
+    },
+    {
+      "epoch": 0.4110479285134037,
+      "grad_norm": 0.5425180792808533,
+      "learning_rate": 0.00019669747285864863,
+      "loss": 0.9072,
+      "step": 1771
+    },
+    {
+      "epoch": 0.41128002785192064,
+      "grad_norm": 0.5784744024276733,
+      "learning_rate": 0.00019669375453746396,
+      "loss": 1.0027,
+      "step": 1772
+    },
+    {
+      "epoch": 0.4115121271904375,
+      "grad_norm": 0.6552026867866516,
+      "learning_rate": 0.0001966900341594096,
+      "loss": 0.9353,
+      "step": 1773
+    },
+    {
+      "epoch": 0.41174422652895437,
+      "grad_norm": 0.4845140874385834,
+      "learning_rate": 0.0001966863117245648,
+      "loss": 0.9227,
+      "step": 1774
+    },
+    {
+      "epoch": 0.4119763258674713,
+      "grad_norm": 0.5522558689117432,
+      "learning_rate": 0.0001966825872330086,
+      "loss": 0.9159,
+      "step": 1775
+    },
+    {
+      "epoch": 0.41220842520598816,
+      "grad_norm": 0.6886111497879028,
+      "learning_rate": 0.0001966788606848203,
+      "loss": 0.8733,
+      "step": 1776
+    },
+    {
+      "epoch": 0.412440524544505,
+      "grad_norm": 0.5358473062515259,
+      "learning_rate": 0.0001966751320800792,
+      "loss": 0.8916,
+      "step": 1777
+    },
+    {
+      "epoch": 0.41267262388302195,
+      "grad_norm": 0.574971079826355,
+      "learning_rate": 0.0001966714014188646,
+      "loss": 0.8828,
+      "step": 1778
+    },
+    {
+      "epoch": 0.4129047232215388,
+      "grad_norm": 0.5384320616722107,
+      "learning_rate": 0.0001966676687012558,
+      "loss": 0.8881,
+      "step": 1779
+    },
+    {
+      "epoch": 0.4131368225600557,
+      "grad_norm": 0.6178532838821411,
+      "learning_rate": 0.00019666393392733228,
+      "loss": 0.9724,
+      "step": 1780
+    },
+    {
+      "epoch": 0.4133689218985726,
+      "grad_norm": 0.5532113313674927,
+      "learning_rate": 0.00019666019709717344,
+      "loss": 0.9535,
+      "step": 1781
+    },
+    {
+      "epoch": 0.4136010212370895,
+      "grad_norm": 0.5668889880180359,
+      "learning_rate": 0.00019665645821085876,
+      "loss": 0.9127,
+      "step": 1782
+    },
+    {
+      "epoch": 0.41383312057560634,
+      "grad_norm": 0.5764045715332031,
+      "learning_rate": 0.00019665271726846783,
+      "loss": 0.9412,
+      "step": 1783
+    },
+    {
+      "epoch": 0.41406521991412326,
+      "grad_norm": 0.5341030955314636,
+      "learning_rate": 0.00019664897427008014,
+      "loss": 0.9349,
+      "step": 1784
+    },
+    {
+      "epoch": 0.41429731925264013,
+      "grad_norm": 0.6231575012207031,
+      "learning_rate": 0.00019664522921577544,
+      "loss": 0.8928,
+      "step": 1785
+    },
+    {
+      "epoch": 0.414529418591157,
+      "grad_norm": 0.5901029706001282,
+      "learning_rate": 0.00019664148210563328,
+      "loss": 0.9054,
+      "step": 1786
+    },
+    {
+      "epoch": 0.4147615179296739,
+      "grad_norm": 0.5409894585609436,
+      "learning_rate": 0.0001966377329397334,
+      "loss": 0.8859,
+      "step": 1787
+    },
+    {
+      "epoch": 0.4149936172681908,
+      "grad_norm": 0.6134136915206909,
+      "learning_rate": 0.00019663398171815554,
+      "loss": 0.8984,
+      "step": 1788
+    },
+    {
+      "epoch": 0.41522571660670765,
+      "grad_norm": 0.5341612696647644,
+      "learning_rate": 0.00019663022844097956,
+      "loss": 0.8723,
+      "step": 1789
+    },
+    {
+      "epoch": 0.4154578159452246,
+      "grad_norm": 0.5658878684043884,
+      "learning_rate": 0.00019662647310828523,
+      "loss": 0.8559,
+      "step": 1790
+    },
+    {
+      "epoch": 0.41568991528374144,
+      "grad_norm": 0.49125760793685913,
+      "learning_rate": 0.00019662271572015247,
+      "loss": 0.8786,
+      "step": 1791
+    },
+    {
+      "epoch": 0.4159220146222583,
+      "grad_norm": 0.6301273703575134,
+      "learning_rate": 0.00019661895627666115,
+      "loss": 0.8943,
+      "step": 1792
+    },
+    {
+      "epoch": 0.41615411396077523,
+      "grad_norm": 0.5750293731689453,
+      "learning_rate": 0.00019661519477789135,
+      "loss": 0.8957,
+      "step": 1793
+    },
+    {
+      "epoch": 0.4163862132992921,
+      "grad_norm": 0.5299922823905945,
+      "learning_rate": 0.000196611431223923,
+      "loss": 0.963,
+      "step": 1794
+    },
+    {
+      "epoch": 0.41661831263780896,
+      "grad_norm": 0.7080173492431641,
+      "learning_rate": 0.00019660766561483618,
+      "loss": 0.9599,
+      "step": 1795
+    },
+    {
+      "epoch": 0.4168504119763259,
+      "grad_norm": 0.5741339921951294,
+      "learning_rate": 0.00019660389795071097,
+      "loss": 0.941,
+      "step": 1796
+    },
+    {
+      "epoch": 0.41708251131484275,
+      "grad_norm": 0.5957292318344116,
+      "learning_rate": 0.00019660012823162755,
+      "loss": 0.91,
+      "step": 1797
+    },
+    {
+      "epoch": 0.4173146106533596,
+      "grad_norm": 0.5832741856575012,
+      "learning_rate": 0.0001965963564576661,
+      "loss": 0.9038,
+      "step": 1798
+    },
+    {
+      "epoch": 0.41754670999187654,
+      "grad_norm": 0.613530158996582,
+      "learning_rate": 0.00019659258262890683,
+      "loss": 0.9392,
+      "step": 1799
+    },
+    {
+      "epoch": 0.4177788093303934,
+      "grad_norm": 0.5645830631256104,
+      "learning_rate": 0.00019658880674543004,
+      "loss": 0.9393,
+      "step": 1800
     }
   ],
   "logging_steps": 1,
@@ -12276,7 +12626,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7.76784253550592e+17,
+  "total_flos": 7.989780893663232e+17,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null