jdannem6 commited on
Commit
80edba6
·
verified ·
1 Parent(s): 648c0e7

Uploaded checkpoint-30000

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1795 -5
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:139f4435bd5729787408c5615feba1fe9895d2e7bd0f5d89d9346fdd1ac87574
3
  size 119975656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d6ba029e80055d962d5591d214007a0348eb30f7919c676c6fbc3118fd3b606
3
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:018c500e950751508e9ad4d41e38708e2cf3bf2e66584879f67e33a3a31f85fa
3
  size 240145026
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cf8943c697998360c8806827f24efee03b88ed48084dbd4d34f651d3d519a00
3
  size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09598c5aaf58b4976874761472c065fd04118ddf756eeecbee75a092b841ae97
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdf2cde96453a2a4e778e1a011567881c16f49aa6c0056bd6e7ac168a68fe3c2
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30c1646ffd4f2e4e86a7c5c87af0949f3be46b7539d2a0137b1bb01bf3e8bbe5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:992f92f28f32913b10be7c822cdebf0abc4183b682b29981995e05a2a28bcd4d
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.7507393956184387,
3
- "best_model_checkpoint": "runs/deepseek_lora_20240423-223943/checkpoint-7500",
4
- "epoch": 0.1875,
5
  "eval_steps": 500,
6
- "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5377,6 +5377,1796 @@
5377
  "eval_samples_per_second": 14.734,
5378
  "eval_steps_per_second": 14.734,
5379
  "step": 7500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5380
  }
5381
  ],
5382
  "logging_steps": 10,
@@ -5384,7 +7174,7 @@
5384
  "num_input_tokens_seen": 0,
5385
  "num_train_epochs": 1,
5386
  "save_steps": 2500,
5387
- "total_flos": 1.2076594495488e+17,
5388
  "train_batch_size": 1,
5389
  "trial_name": null,
5390
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7402730584144592,
3
+ "best_model_checkpoint": "runs/deepseek_lora_20240423-223943/checkpoint-10000",
4
+ "epoch": 0.25,
5
  "eval_steps": 500,
6
+ "global_step": 10000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5377
  "eval_samples_per_second": 14.734,
5378
  "eval_steps_per_second": 14.734,
5379
  "step": 7500
5380
+ },
5381
+ {
5382
+ "epoch": 0.19,
5383
+ "grad_norm": 10.315424919128418,
5384
+ "learning_rate": 5.242105263157895e-06,
5385
+ "loss": 0.736,
5386
+ "step": 7510
5387
+ },
5388
+ {
5389
+ "epoch": 0.19,
5390
+ "grad_norm": 3.440877676010132,
5391
+ "learning_rate": 5.2210526315789475e-06,
5392
+ "loss": 0.799,
5393
+ "step": 7520
5394
+ },
5395
+ {
5396
+ "epoch": 0.19,
5397
+ "grad_norm": 2.361064910888672,
5398
+ "learning_rate": 5.2e-06,
5399
+ "loss": 0.832,
5400
+ "step": 7530
5401
+ },
5402
+ {
5403
+ "epoch": 0.19,
5404
+ "grad_norm": 2.1224961280822754,
5405
+ "learning_rate": 5.178947368421054e-06,
5406
+ "loss": 0.7118,
5407
+ "step": 7540
5408
+ },
5409
+ {
5410
+ "epoch": 0.19,
5411
+ "grad_norm": 4.9322614669799805,
5412
+ "learning_rate": 5.157894736842106e-06,
5413
+ "loss": 0.6614,
5414
+ "step": 7550
5415
+ },
5416
+ {
5417
+ "epoch": 0.19,
5418
+ "grad_norm": 4.812900066375732,
5419
+ "learning_rate": 5.136842105263158e-06,
5420
+ "loss": 0.8002,
5421
+ "step": 7560
5422
+ },
5423
+ {
5424
+ "epoch": 0.19,
5425
+ "grad_norm": 6.411820411682129,
5426
+ "learning_rate": 5.115789473684211e-06,
5427
+ "loss": 0.835,
5428
+ "step": 7570
5429
+ },
5430
+ {
5431
+ "epoch": 0.19,
5432
+ "grad_norm": 5.406981468200684,
5433
+ "learning_rate": 5.0947368421052635e-06,
5434
+ "loss": 0.8384,
5435
+ "step": 7580
5436
+ },
5437
+ {
5438
+ "epoch": 0.19,
5439
+ "grad_norm": 4.32007360458374,
5440
+ "learning_rate": 5.073684210526316e-06,
5441
+ "loss": 0.5798,
5442
+ "step": 7590
5443
+ },
5444
+ {
5445
+ "epoch": 0.19,
5446
+ "grad_norm": 4.640589714050293,
5447
+ "learning_rate": 5.052631578947369e-06,
5448
+ "loss": 0.7896,
5449
+ "step": 7600
5450
+ },
5451
+ {
5452
+ "epoch": 0.19,
5453
+ "grad_norm": 5.4717936515808105,
5454
+ "learning_rate": 5.0315789473684214e-06,
5455
+ "loss": 0.7829,
5456
+ "step": 7610
5457
+ },
5458
+ {
5459
+ "epoch": 0.19,
5460
+ "grad_norm": 2.995558261871338,
5461
+ "learning_rate": 5.010526315789475e-06,
5462
+ "loss": 0.7322,
5463
+ "step": 7620
5464
+ },
5465
+ {
5466
+ "epoch": 0.19,
5467
+ "grad_norm": 2.5911152362823486,
5468
+ "learning_rate": 4.989473684210527e-06,
5469
+ "loss": 0.7727,
5470
+ "step": 7630
5471
+ },
5472
+ {
5473
+ "epoch": 0.19,
5474
+ "grad_norm": 3.3829457759857178,
5475
+ "learning_rate": 4.968421052631579e-06,
5476
+ "loss": 0.7178,
5477
+ "step": 7640
5478
+ },
5479
+ {
5480
+ "epoch": 0.19,
5481
+ "grad_norm": 5.157157897949219,
5482
+ "learning_rate": 4.947368421052632e-06,
5483
+ "loss": 0.7241,
5484
+ "step": 7650
5485
+ },
5486
+ {
5487
+ "epoch": 0.19,
5488
+ "grad_norm": 6.205902099609375,
5489
+ "learning_rate": 4.926315789473685e-06,
5490
+ "loss": 0.7831,
5491
+ "step": 7660
5492
+ },
5493
+ {
5494
+ "epoch": 0.19,
5495
+ "grad_norm": 3.92594051361084,
5496
+ "learning_rate": 4.905263157894737e-06,
5497
+ "loss": 0.8057,
5498
+ "step": 7670
5499
+ },
5500
+ {
5501
+ "epoch": 0.19,
5502
+ "grad_norm": 4.578032493591309,
5503
+ "learning_rate": 4.88421052631579e-06,
5504
+ "loss": 0.8011,
5505
+ "step": 7680
5506
+ },
5507
+ {
5508
+ "epoch": 0.19,
5509
+ "grad_norm": 6.8539605140686035,
5510
+ "learning_rate": 4.863157894736843e-06,
5511
+ "loss": 0.7792,
5512
+ "step": 7690
5513
+ },
5514
+ {
5515
+ "epoch": 0.19,
5516
+ "grad_norm": 7.954685211181641,
5517
+ "learning_rate": 4.842105263157895e-06,
5518
+ "loss": 0.6691,
5519
+ "step": 7700
5520
+ },
5521
+ {
5522
+ "epoch": 0.19,
5523
+ "grad_norm": 2.0253312587738037,
5524
+ "learning_rate": 4.821052631578948e-06,
5525
+ "loss": 0.6483,
5526
+ "step": 7710
5527
+ },
5528
+ {
5529
+ "epoch": 0.19,
5530
+ "grad_norm": 8.230294227600098,
5531
+ "learning_rate": 4.800000000000001e-06,
5532
+ "loss": 0.8076,
5533
+ "step": 7720
5534
+ },
5535
+ {
5536
+ "epoch": 0.19,
5537
+ "grad_norm": 2.5444509983062744,
5538
+ "learning_rate": 4.778947368421053e-06,
5539
+ "loss": 0.7902,
5540
+ "step": 7730
5541
+ },
5542
+ {
5543
+ "epoch": 0.19,
5544
+ "grad_norm": 1.8759273290634155,
5545
+ "learning_rate": 4.757894736842106e-06,
5546
+ "loss": 0.7308,
5547
+ "step": 7740
5548
+ },
5549
+ {
5550
+ "epoch": 0.19,
5551
+ "grad_norm": 5.69119930267334,
5552
+ "learning_rate": 4.736842105263158e-06,
5553
+ "loss": 0.6605,
5554
+ "step": 7750
5555
+ },
5556
+ {
5557
+ "epoch": 0.19,
5558
+ "grad_norm": 7.020988941192627,
5559
+ "learning_rate": 4.71578947368421e-06,
5560
+ "loss": 0.7678,
5561
+ "step": 7760
5562
+ },
5563
+ {
5564
+ "epoch": 0.19,
5565
+ "grad_norm": 4.7685866355896,
5566
+ "learning_rate": 4.694736842105264e-06,
5567
+ "loss": 0.8022,
5568
+ "step": 7770
5569
+ },
5570
+ {
5571
+ "epoch": 0.19,
5572
+ "grad_norm": 2.516789436340332,
5573
+ "learning_rate": 4.6736842105263166e-06,
5574
+ "loss": 0.6176,
5575
+ "step": 7780
5576
+ },
5577
+ {
5578
+ "epoch": 0.19,
5579
+ "grad_norm": 4.267387866973877,
5580
+ "learning_rate": 4.652631578947368e-06,
5581
+ "loss": 0.6487,
5582
+ "step": 7790
5583
+ },
5584
+ {
5585
+ "epoch": 0.2,
5586
+ "grad_norm": 5.96762228012085,
5587
+ "learning_rate": 4.631578947368421e-06,
5588
+ "loss": 0.7066,
5589
+ "step": 7800
5590
+ },
5591
+ {
5592
+ "epoch": 0.2,
5593
+ "grad_norm": 4.345110893249512,
5594
+ "learning_rate": 4.6105263157894745e-06,
5595
+ "loss": 0.6072,
5596
+ "step": 7810
5597
+ },
5598
+ {
5599
+ "epoch": 0.2,
5600
+ "grad_norm": 10.33462142944336,
5601
+ "learning_rate": 4.589473684210526e-06,
5602
+ "loss": 0.8211,
5603
+ "step": 7820
5604
+ },
5605
+ {
5606
+ "epoch": 0.2,
5607
+ "grad_norm": 4.632289409637451,
5608
+ "learning_rate": 4.568421052631579e-06,
5609
+ "loss": 0.8335,
5610
+ "step": 7830
5611
+ },
5612
+ {
5613
+ "epoch": 0.2,
5614
+ "grad_norm": 4.453967094421387,
5615
+ "learning_rate": 4.547368421052632e-06,
5616
+ "loss": 0.8331,
5617
+ "step": 7840
5618
+ },
5619
+ {
5620
+ "epoch": 0.2,
5621
+ "grad_norm": 5.877091407775879,
5622
+ "learning_rate": 4.526315789473685e-06,
5623
+ "loss": 0.6793,
5624
+ "step": 7850
5625
+ },
5626
+ {
5627
+ "epoch": 0.2,
5628
+ "grad_norm": 16.41980743408203,
5629
+ "learning_rate": 4.505263157894737e-06,
5630
+ "loss": 0.819,
5631
+ "step": 7860
5632
+ },
5633
+ {
5634
+ "epoch": 0.2,
5635
+ "grad_norm": 3.1915693283081055,
5636
+ "learning_rate": 4.48421052631579e-06,
5637
+ "loss": 0.7217,
5638
+ "step": 7870
5639
+ },
5640
+ {
5641
+ "epoch": 0.2,
5642
+ "grad_norm": 5.805244445800781,
5643
+ "learning_rate": 4.463157894736842e-06,
5644
+ "loss": 0.7146,
5645
+ "step": 7880
5646
+ },
5647
+ {
5648
+ "epoch": 0.2,
5649
+ "grad_norm": 2.697472333908081,
5650
+ "learning_rate": 4.442105263157896e-06,
5651
+ "loss": 0.6748,
5652
+ "step": 7890
5653
+ },
5654
+ {
5655
+ "epoch": 0.2,
5656
+ "grad_norm": 3.6001346111297607,
5657
+ "learning_rate": 4.4210526315789476e-06,
5658
+ "loss": 0.6972,
5659
+ "step": 7900
5660
+ },
5661
+ {
5662
+ "epoch": 0.2,
5663
+ "grad_norm": 4.912445545196533,
5664
+ "learning_rate": 4.4e-06,
5665
+ "loss": 0.7157,
5666
+ "step": 7910
5667
+ },
5668
+ {
5669
+ "epoch": 0.2,
5670
+ "grad_norm": 6.9912309646606445,
5671
+ "learning_rate": 4.378947368421053e-06,
5672
+ "loss": 0.5927,
5673
+ "step": 7920
5674
+ },
5675
+ {
5676
+ "epoch": 0.2,
5677
+ "grad_norm": 4.380290985107422,
5678
+ "learning_rate": 4.3578947368421055e-06,
5679
+ "loss": 0.699,
5680
+ "step": 7930
5681
+ },
5682
+ {
5683
+ "epoch": 0.2,
5684
+ "grad_norm": 4.024576663970947,
5685
+ "learning_rate": 4.336842105263158e-06,
5686
+ "loss": 0.8156,
5687
+ "step": 7940
5688
+ },
5689
+ {
5690
+ "epoch": 0.2,
5691
+ "grad_norm": 3.523719310760498,
5692
+ "learning_rate": 4.315789473684211e-06,
5693
+ "loss": 0.7827,
5694
+ "step": 7950
5695
+ },
5696
+ {
5697
+ "epoch": 0.2,
5698
+ "grad_norm": 10.055171966552734,
5699
+ "learning_rate": 4.2947368421052635e-06,
5700
+ "loss": 0.7142,
5701
+ "step": 7960
5702
+ },
5703
+ {
5704
+ "epoch": 0.2,
5705
+ "grad_norm": 7.437203407287598,
5706
+ "learning_rate": 4.273684210526316e-06,
5707
+ "loss": 0.7184,
5708
+ "step": 7970
5709
+ },
5710
+ {
5711
+ "epoch": 0.2,
5712
+ "grad_norm": 2.6910207271575928,
5713
+ "learning_rate": 4.252631578947369e-06,
5714
+ "loss": 0.7311,
5715
+ "step": 7980
5716
+ },
5717
+ {
5718
+ "epoch": 0.2,
5719
+ "grad_norm": 12.729212760925293,
5720
+ "learning_rate": 4.2315789473684215e-06,
5721
+ "loss": 0.7629,
5722
+ "step": 7990
5723
+ },
5724
+ {
5725
+ "epoch": 0.2,
5726
+ "grad_norm": 3.817344903945923,
5727
+ "learning_rate": 4.210526315789474e-06,
5728
+ "loss": 0.8676,
5729
+ "step": 8000
5730
+ },
5731
+ {
5732
+ "epoch": 0.2,
5733
+ "eval_loss": 0.7396635413169861,
5734
+ "eval_runtime": 67.9126,
5735
+ "eval_samples_per_second": 14.725,
5736
+ "eval_steps_per_second": 14.725,
5737
+ "step": 8000
5738
+ },
5739
+ {
5740
+ "epoch": 0.2,
5741
+ "grad_norm": 5.193355083465576,
5742
+ "learning_rate": 4.189473684210527e-06,
5743
+ "loss": 0.7036,
5744
+ "step": 8010
5745
+ },
5746
+ {
5747
+ "epoch": 0.2,
5748
+ "grad_norm": 3.617652177810669,
5749
+ "learning_rate": 4.1684210526315794e-06,
5750
+ "loss": 0.6547,
5751
+ "step": 8020
5752
+ },
5753
+ {
5754
+ "epoch": 0.2,
5755
+ "grad_norm": 3.48286771774292,
5756
+ "learning_rate": 4.147368421052632e-06,
5757
+ "loss": 0.6756,
5758
+ "step": 8030
5759
+ },
5760
+ {
5761
+ "epoch": 0.2,
5762
+ "grad_norm": 4.939229965209961,
5763
+ "learning_rate": 4.126315789473685e-06,
5764
+ "loss": 0.7157,
5765
+ "step": 8040
5766
+ },
5767
+ {
5768
+ "epoch": 0.2,
5769
+ "grad_norm": 14.387231826782227,
5770
+ "learning_rate": 4.105263157894737e-06,
5771
+ "loss": 0.8052,
5772
+ "step": 8050
5773
+ },
5774
+ {
5775
+ "epoch": 0.2,
5776
+ "grad_norm": 4.042211055755615,
5777
+ "learning_rate": 4.08421052631579e-06,
5778
+ "loss": 0.6733,
5779
+ "step": 8060
5780
+ },
5781
+ {
5782
+ "epoch": 0.2,
5783
+ "grad_norm": 6.068091869354248,
5784
+ "learning_rate": 4.063157894736842e-06,
5785
+ "loss": 0.6172,
5786
+ "step": 8070
5787
+ },
5788
+ {
5789
+ "epoch": 0.2,
5790
+ "grad_norm": 5.004486083984375,
5791
+ "learning_rate": 4.042105263157895e-06,
5792
+ "loss": 0.7888,
5793
+ "step": 8080
5794
+ },
5795
+ {
5796
+ "epoch": 0.2,
5797
+ "grad_norm": 5.651116847991943,
5798
+ "learning_rate": 4.021052631578948e-06,
5799
+ "loss": 0.6979,
5800
+ "step": 8090
5801
+ },
5802
+ {
5803
+ "epoch": 0.2,
5804
+ "grad_norm": 3.581594944000244,
5805
+ "learning_rate": 4.000000000000001e-06,
5806
+ "loss": 0.7654,
5807
+ "step": 8100
5808
+ },
5809
+ {
5810
+ "epoch": 0.2,
5811
+ "grad_norm": 2.6030330657958984,
5812
+ "learning_rate": 3.9789473684210525e-06,
5813
+ "loss": 0.7946,
5814
+ "step": 8110
5815
+ },
5816
+ {
5817
+ "epoch": 0.2,
5818
+ "grad_norm": 5.385477542877197,
5819
+ "learning_rate": 3.957894736842106e-06,
5820
+ "loss": 0.7785,
5821
+ "step": 8120
5822
+ },
5823
+ {
5824
+ "epoch": 0.2,
5825
+ "grad_norm": 5.688074588775635,
5826
+ "learning_rate": 3.936842105263159e-06,
5827
+ "loss": 0.7762,
5828
+ "step": 8130
5829
+ },
5830
+ {
5831
+ "epoch": 0.2,
5832
+ "grad_norm": 1.7027924060821533,
5833
+ "learning_rate": 3.9157894736842104e-06,
5834
+ "loss": 0.6933,
5835
+ "step": 8140
5836
+ },
5837
+ {
5838
+ "epoch": 0.2,
5839
+ "grad_norm": 5.239694118499756,
5840
+ "learning_rate": 3.894736842105263e-06,
5841
+ "loss": 0.8061,
5842
+ "step": 8150
5843
+ },
5844
+ {
5845
+ "epoch": 0.2,
5846
+ "grad_norm": 4.3939032554626465,
5847
+ "learning_rate": 3.873684210526316e-06,
5848
+ "loss": 0.7537,
5849
+ "step": 8160
5850
+ },
5851
+ {
5852
+ "epoch": 0.2,
5853
+ "grad_norm": 5.115386962890625,
5854
+ "learning_rate": 3.852631578947369e-06,
5855
+ "loss": 0.7025,
5856
+ "step": 8170
5857
+ },
5858
+ {
5859
+ "epoch": 0.2,
5860
+ "grad_norm": 4.546750545501709,
5861
+ "learning_rate": 3.831578947368421e-06,
5862
+ "loss": 0.7108,
5863
+ "step": 8180
5864
+ },
5865
+ {
5866
+ "epoch": 0.2,
5867
+ "grad_norm": 3.043384552001953,
5868
+ "learning_rate": 3.810526315789474e-06,
5869
+ "loss": 0.7506,
5870
+ "step": 8190
5871
+ },
5872
+ {
5873
+ "epoch": 0.2,
5874
+ "grad_norm": 2.8117778301239014,
5875
+ "learning_rate": 3.789473684210527e-06,
5876
+ "loss": 0.773,
5877
+ "step": 8200
5878
+ },
5879
+ {
5880
+ "epoch": 0.21,
5881
+ "grad_norm": 6.000233173370361,
5882
+ "learning_rate": 3.768421052631579e-06,
5883
+ "loss": 0.6902,
5884
+ "step": 8210
5885
+ },
5886
+ {
5887
+ "epoch": 0.21,
5888
+ "grad_norm": 6.7739787101745605,
5889
+ "learning_rate": 3.7473684210526317e-06,
5890
+ "loss": 0.6397,
5891
+ "step": 8220
5892
+ },
5893
+ {
5894
+ "epoch": 0.21,
5895
+ "grad_norm": 4.948480129241943,
5896
+ "learning_rate": 3.7263157894736848e-06,
5897
+ "loss": 0.6185,
5898
+ "step": 8230
5899
+ },
5900
+ {
5901
+ "epoch": 0.21,
5902
+ "grad_norm": 4.269702434539795,
5903
+ "learning_rate": 3.7052631578947374e-06,
5904
+ "loss": 0.7487,
5905
+ "step": 8240
5906
+ },
5907
+ {
5908
+ "epoch": 0.21,
5909
+ "grad_norm": 3.8336634635925293,
5910
+ "learning_rate": 3.6842105263157896e-06,
5911
+ "loss": 0.7805,
5912
+ "step": 8250
5913
+ },
5914
+ {
5915
+ "epoch": 0.21,
5916
+ "grad_norm": 4.896543979644775,
5917
+ "learning_rate": 3.6631578947368423e-06,
5918
+ "loss": 0.645,
5919
+ "step": 8260
5920
+ },
5921
+ {
5922
+ "epoch": 0.21,
5923
+ "grad_norm": 6.051191806793213,
5924
+ "learning_rate": 3.642105263157895e-06,
5925
+ "loss": 0.7477,
5926
+ "step": 8270
5927
+ },
5928
+ {
5929
+ "epoch": 0.21,
5930
+ "grad_norm": 24.540451049804688,
5931
+ "learning_rate": 3.621052631578948e-06,
5932
+ "loss": 0.8168,
5933
+ "step": 8280
5934
+ },
5935
+ {
5936
+ "epoch": 0.21,
5937
+ "grad_norm": 5.061807155609131,
5938
+ "learning_rate": 3.6000000000000003e-06,
5939
+ "loss": 0.727,
5940
+ "step": 8290
5941
+ },
5942
+ {
5943
+ "epoch": 0.21,
5944
+ "grad_norm": 2.3907368183135986,
5945
+ "learning_rate": 3.578947368421053e-06,
5946
+ "loss": 0.6614,
5947
+ "step": 8300
5948
+ },
5949
+ {
5950
+ "epoch": 0.21,
5951
+ "grad_norm": 4.554809093475342,
5952
+ "learning_rate": 3.5578947368421056e-06,
5953
+ "loss": 0.6947,
5954
+ "step": 8310
5955
+ },
5956
+ {
5957
+ "epoch": 0.21,
5958
+ "grad_norm": 3.7383534908294678,
5959
+ "learning_rate": 3.536842105263158e-06,
5960
+ "loss": 0.6171,
5961
+ "step": 8320
5962
+ },
5963
+ {
5964
+ "epoch": 0.21,
5965
+ "grad_norm": 4.406937122344971,
5966
+ "learning_rate": 3.515789473684211e-06,
5967
+ "loss": 0.6102,
5968
+ "step": 8330
5969
+ },
5970
+ {
5971
+ "epoch": 0.21,
5972
+ "grad_norm": 5.226219654083252,
5973
+ "learning_rate": 3.4947368421052635e-06,
5974
+ "loss": 0.7746,
5975
+ "step": 8340
5976
+ },
5977
+ {
5978
+ "epoch": 0.21,
5979
+ "grad_norm": 6.249040126800537,
5980
+ "learning_rate": 3.473684210526316e-06,
5981
+ "loss": 0.7158,
5982
+ "step": 8350
5983
+ },
5984
+ {
5985
+ "epoch": 0.21,
5986
+ "grad_norm": 6.806312084197998,
5987
+ "learning_rate": 3.4526315789473684e-06,
5988
+ "loss": 0.7249,
5989
+ "step": 8360
5990
+ },
5991
+ {
5992
+ "epoch": 0.21,
5993
+ "grad_norm": 2.993473529815674,
5994
+ "learning_rate": 3.4315789473684215e-06,
5995
+ "loss": 0.826,
5996
+ "step": 8370
5997
+ },
5998
+ {
5999
+ "epoch": 0.21,
6000
+ "grad_norm": 4.120741367340088,
6001
+ "learning_rate": 3.410526315789474e-06,
6002
+ "loss": 0.6238,
6003
+ "step": 8380
6004
+ },
6005
+ {
6006
+ "epoch": 0.21,
6007
+ "grad_norm": 4.020960807800293,
6008
+ "learning_rate": 3.3894736842105264e-06,
6009
+ "loss": 0.6749,
6010
+ "step": 8390
6011
+ },
6012
+ {
6013
+ "epoch": 0.21,
6014
+ "grad_norm": 6.000002384185791,
6015
+ "learning_rate": 3.368421052631579e-06,
6016
+ "loss": 0.7652,
6017
+ "step": 8400
6018
+ },
6019
+ {
6020
+ "epoch": 0.21,
6021
+ "grad_norm": 8.221445083618164,
6022
+ "learning_rate": 3.347368421052632e-06,
6023
+ "loss": 0.7781,
6024
+ "step": 8410
6025
+ },
6026
+ {
6027
+ "epoch": 0.21,
6028
+ "grad_norm": 5.850223541259766,
6029
+ "learning_rate": 3.3263157894736848e-06,
6030
+ "loss": 0.7555,
6031
+ "step": 8420
6032
+ },
6033
+ {
6034
+ "epoch": 0.21,
6035
+ "grad_norm": 2.249915838241577,
6036
+ "learning_rate": 3.305263157894737e-06,
6037
+ "loss": 0.7305,
6038
+ "step": 8430
6039
+ },
6040
+ {
6041
+ "epoch": 0.21,
6042
+ "grad_norm": 4.955141067504883,
6043
+ "learning_rate": 3.2842105263157897e-06,
6044
+ "loss": 0.6817,
6045
+ "step": 8440
6046
+ },
6047
+ {
6048
+ "epoch": 0.21,
6049
+ "grad_norm": 2.4711403846740723,
6050
+ "learning_rate": 3.2631578947368423e-06,
6051
+ "loss": 0.683,
6052
+ "step": 8450
6053
+ },
6054
+ {
6055
+ "epoch": 0.21,
6056
+ "grad_norm": 5.367486953735352,
6057
+ "learning_rate": 3.2421052631578945e-06,
6058
+ "loss": 0.6494,
6059
+ "step": 8460
6060
+ },
6061
+ {
6062
+ "epoch": 0.21,
6063
+ "grad_norm": 3.283465623855591,
6064
+ "learning_rate": 3.2210526315789476e-06,
6065
+ "loss": 0.6092,
6066
+ "step": 8470
6067
+ },
6068
+ {
6069
+ "epoch": 0.21,
6070
+ "grad_norm": 4.473137855529785,
6071
+ "learning_rate": 3.2000000000000003e-06,
6072
+ "loss": 0.676,
6073
+ "step": 8480
6074
+ },
6075
+ {
6076
+ "epoch": 0.21,
6077
+ "grad_norm": 3.177180528640747,
6078
+ "learning_rate": 3.178947368421053e-06,
6079
+ "loss": 0.6685,
6080
+ "step": 8490
6081
+ },
6082
+ {
6083
+ "epoch": 0.21,
6084
+ "grad_norm": 4.735683441162109,
6085
+ "learning_rate": 3.157894736842105e-06,
6086
+ "loss": 0.7544,
6087
+ "step": 8500
6088
+ },
6089
+ {
6090
+ "epoch": 0.21,
6091
+ "eval_loss": 0.7582711577415466,
6092
+ "eval_runtime": 67.8631,
6093
+ "eval_samples_per_second": 14.736,
6094
+ "eval_steps_per_second": 14.736,
6095
+ "step": 8500
6096
+ },
6097
+ {
6098
+ "epoch": 0.21,
6099
+ "grad_norm": 4.465471267700195,
6100
+ "learning_rate": 3.1368421052631582e-06,
6101
+ "loss": 0.8191,
6102
+ "step": 8510
6103
+ },
6104
+ {
6105
+ "epoch": 0.21,
6106
+ "grad_norm": 3.8849751949310303,
6107
+ "learning_rate": 3.115789473684211e-06,
6108
+ "loss": 0.7078,
6109
+ "step": 8520
6110
+ },
6111
+ {
6112
+ "epoch": 0.21,
6113
+ "grad_norm": 5.555447101593018,
6114
+ "learning_rate": 3.094736842105263e-06,
6115
+ "loss": 0.7332,
6116
+ "step": 8530
6117
+ },
6118
+ {
6119
+ "epoch": 0.21,
6120
+ "grad_norm": 4.269344806671143,
6121
+ "learning_rate": 3.0736842105263158e-06,
6122
+ "loss": 0.7619,
6123
+ "step": 8540
6124
+ },
6125
+ {
6126
+ "epoch": 0.21,
6127
+ "grad_norm": 5.792567729949951,
6128
+ "learning_rate": 3.052631578947369e-06,
6129
+ "loss": 0.6858,
6130
+ "step": 8550
6131
+ },
6132
+ {
6133
+ "epoch": 0.21,
6134
+ "grad_norm": 4.095942974090576,
6135
+ "learning_rate": 3.0315789473684215e-06,
6136
+ "loss": 0.7793,
6137
+ "step": 8560
6138
+ },
6139
+ {
6140
+ "epoch": 0.21,
6141
+ "grad_norm": 3.316791296005249,
6142
+ "learning_rate": 3.0105263157894737e-06,
6143
+ "loss": 0.666,
6144
+ "step": 8570
6145
+ },
6146
+ {
6147
+ "epoch": 0.21,
6148
+ "grad_norm": 4.55336332321167,
6149
+ "learning_rate": 2.9894736842105264e-06,
6150
+ "loss": 0.7723,
6151
+ "step": 8580
6152
+ },
6153
+ {
6154
+ "epoch": 0.21,
6155
+ "grad_norm": 7.5306315422058105,
6156
+ "learning_rate": 2.9684210526315795e-06,
6157
+ "loss": 0.7283,
6158
+ "step": 8590
6159
+ },
6160
+ {
6161
+ "epoch": 0.21,
6162
+ "grad_norm": 3.935115337371826,
6163
+ "learning_rate": 2.9473684210526317e-06,
6164
+ "loss": 0.7843,
6165
+ "step": 8600
6166
+ },
6167
+ {
6168
+ "epoch": 0.22,
6169
+ "grad_norm": 5.173915863037109,
6170
+ "learning_rate": 2.9263157894736844e-06,
6171
+ "loss": 0.6662,
6172
+ "step": 8610
6173
+ },
6174
+ {
6175
+ "epoch": 0.22,
6176
+ "grad_norm": 3.5214264392852783,
6177
+ "learning_rate": 2.905263157894737e-06,
6178
+ "loss": 0.6887,
6179
+ "step": 8620
6180
+ },
6181
+ {
6182
+ "epoch": 0.22,
6183
+ "grad_norm": 4.139004707336426,
6184
+ "learning_rate": 2.88421052631579e-06,
6185
+ "loss": 0.6778,
6186
+ "step": 8630
6187
+ },
6188
+ {
6189
+ "epoch": 0.22,
6190
+ "grad_norm": 4.185042381286621,
6191
+ "learning_rate": 2.8631578947368423e-06,
6192
+ "loss": 0.9094,
6193
+ "step": 8640
6194
+ },
6195
+ {
6196
+ "epoch": 0.22,
6197
+ "grad_norm": 3.3607513904571533,
6198
+ "learning_rate": 2.842105263157895e-06,
6199
+ "loss": 0.7918,
6200
+ "step": 8650
6201
+ },
6202
+ {
6203
+ "epoch": 0.22,
6204
+ "grad_norm": 5.062870502471924,
6205
+ "learning_rate": 2.8210526315789476e-06,
6206
+ "loss": 0.7694,
6207
+ "step": 8660
6208
+ },
6209
+ {
6210
+ "epoch": 0.22,
6211
+ "grad_norm": 5.099003791809082,
6212
+ "learning_rate": 2.8000000000000003e-06,
6213
+ "loss": 0.7301,
6214
+ "step": 8670
6215
+ },
6216
+ {
6217
+ "epoch": 0.22,
6218
+ "grad_norm": 5.512063026428223,
6219
+ "learning_rate": 2.7789473684210525e-06,
6220
+ "loss": 0.7887,
6221
+ "step": 8680
6222
+ },
6223
+ {
6224
+ "epoch": 0.22,
6225
+ "grad_norm": 3.625652551651001,
6226
+ "learning_rate": 2.7578947368421056e-06,
6227
+ "loss": 0.7781,
6228
+ "step": 8690
6229
+ },
6230
+ {
6231
+ "epoch": 0.22,
6232
+ "grad_norm": 2.8921008110046387,
6233
+ "learning_rate": 2.7368421052631583e-06,
6234
+ "loss": 0.7582,
6235
+ "step": 8700
6236
+ },
6237
+ {
6238
+ "epoch": 0.22,
6239
+ "grad_norm": 10.71945571899414,
6240
+ "learning_rate": 2.7157894736842105e-06,
6241
+ "loss": 0.7234,
6242
+ "step": 8710
6243
+ },
6244
+ {
6245
+ "epoch": 0.22,
6246
+ "grad_norm": 17.737136840820312,
6247
+ "learning_rate": 2.694736842105263e-06,
6248
+ "loss": 0.6298,
6249
+ "step": 8720
6250
+ },
6251
+ {
6252
+ "epoch": 0.22,
6253
+ "grad_norm": 9.8464994430542,
6254
+ "learning_rate": 2.6736842105263162e-06,
6255
+ "loss": 0.7856,
6256
+ "step": 8730
6257
+ },
6258
+ {
6259
+ "epoch": 0.22,
6260
+ "grad_norm": 7.925550937652588,
6261
+ "learning_rate": 2.652631578947369e-06,
6262
+ "loss": 0.8387,
6263
+ "step": 8740
6264
+ },
6265
+ {
6266
+ "epoch": 0.22,
6267
+ "grad_norm": 3.530381441116333,
6268
+ "learning_rate": 2.631578947368421e-06,
6269
+ "loss": 0.8223,
6270
+ "step": 8750
6271
+ },
6272
+ {
6273
+ "epoch": 0.22,
6274
+ "grad_norm": 6.403299808502197,
6275
+ "learning_rate": 2.6105263157894738e-06,
6276
+ "loss": 0.8079,
6277
+ "step": 8760
6278
+ },
6279
+ {
6280
+ "epoch": 0.22,
6281
+ "grad_norm": 5.1753740310668945,
6282
+ "learning_rate": 2.589473684210527e-06,
6283
+ "loss": 0.7888,
6284
+ "step": 8770
6285
+ },
6286
+ {
6287
+ "epoch": 0.22,
6288
+ "grad_norm": 2.760190725326538,
6289
+ "learning_rate": 2.568421052631579e-06,
6290
+ "loss": 0.7071,
6291
+ "step": 8780
6292
+ },
6293
+ {
6294
+ "epoch": 0.22,
6295
+ "grad_norm": 5.183119297027588,
6296
+ "learning_rate": 2.5473684210526317e-06,
6297
+ "loss": 0.619,
6298
+ "step": 8790
6299
+ },
6300
+ {
6301
+ "epoch": 0.22,
6302
+ "grad_norm": 5.66708517074585,
6303
+ "learning_rate": 2.5263157894736844e-06,
6304
+ "loss": 0.7888,
6305
+ "step": 8800
6306
+ },
6307
+ {
6308
+ "epoch": 0.22,
6309
+ "grad_norm": 2.3660988807678223,
6310
+ "learning_rate": 2.5052631578947375e-06,
6311
+ "loss": 0.7466,
6312
+ "step": 8810
6313
+ },
6314
+ {
6315
+ "epoch": 0.22,
6316
+ "grad_norm": 3.8384206295013428,
6317
+ "learning_rate": 2.4842105263157897e-06,
6318
+ "loss": 0.7371,
6319
+ "step": 8820
6320
+ },
6321
+ {
6322
+ "epoch": 0.22,
6323
+ "grad_norm": 3.593717336654663,
6324
+ "learning_rate": 2.4631578947368424e-06,
6325
+ "loss": 0.5967,
6326
+ "step": 8830
6327
+ },
6328
+ {
6329
+ "epoch": 0.22,
6330
+ "grad_norm": 2.778346538543701,
6331
+ "learning_rate": 2.442105263157895e-06,
6332
+ "loss": 0.6407,
6333
+ "step": 8840
6334
+ },
6335
+ {
6336
+ "epoch": 0.22,
6337
+ "grad_norm": 10.841148376464844,
6338
+ "learning_rate": 2.4210526315789477e-06,
6339
+ "loss": 0.8172,
6340
+ "step": 8850
6341
+ },
6342
+ {
6343
+ "epoch": 0.22,
6344
+ "grad_norm": 2.635694980621338,
6345
+ "learning_rate": 2.4000000000000003e-06,
6346
+ "loss": 0.8135,
6347
+ "step": 8860
6348
+ },
6349
+ {
6350
+ "epoch": 0.22,
6351
+ "grad_norm": 1.5510995388031006,
6352
+ "learning_rate": 2.378947368421053e-06,
6353
+ "loss": 0.8328,
6354
+ "step": 8870
6355
+ },
6356
+ {
6357
+ "epoch": 0.22,
6358
+ "grad_norm": 3.770972967147827,
6359
+ "learning_rate": 2.357894736842105e-06,
6360
+ "loss": 0.6642,
6361
+ "step": 8880
6362
+ },
6363
+ {
6364
+ "epoch": 0.22,
6365
+ "grad_norm": 5.756451606750488,
6366
+ "learning_rate": 2.3368421052631583e-06,
6367
+ "loss": 0.7484,
6368
+ "step": 8890
6369
+ },
6370
+ {
6371
+ "epoch": 0.22,
6372
+ "grad_norm": 2.9202377796173096,
6373
+ "learning_rate": 2.3157894736842105e-06,
6374
+ "loss": 0.7381,
6375
+ "step": 8900
6376
+ },
6377
+ {
6378
+ "epoch": 0.22,
6379
+ "grad_norm": 4.43782377243042,
6380
+ "learning_rate": 2.294736842105263e-06,
6381
+ "loss": 0.7915,
6382
+ "step": 8910
6383
+ },
6384
+ {
6385
+ "epoch": 0.22,
6386
+ "grad_norm": 20.496152877807617,
6387
+ "learning_rate": 2.273684210526316e-06,
6388
+ "loss": 0.6872,
6389
+ "step": 8920
6390
+ },
6391
+ {
6392
+ "epoch": 0.22,
6393
+ "grad_norm": 3.2591583728790283,
6394
+ "learning_rate": 2.2526315789473685e-06,
6395
+ "loss": 0.668,
6396
+ "step": 8930
6397
+ },
6398
+ {
6399
+ "epoch": 0.22,
6400
+ "grad_norm": 2.23056960105896,
6401
+ "learning_rate": 2.231578947368421e-06,
6402
+ "loss": 0.6229,
6403
+ "step": 8940
6404
+ },
6405
+ {
6406
+ "epoch": 0.22,
6407
+ "grad_norm": 5.419168949127197,
6408
+ "learning_rate": 2.2105263157894738e-06,
6409
+ "loss": 0.9534,
6410
+ "step": 8950
6411
+ },
6412
+ {
6413
+ "epoch": 0.22,
6414
+ "grad_norm": 15.681089401245117,
6415
+ "learning_rate": 2.1894736842105264e-06,
6416
+ "loss": 0.782,
6417
+ "step": 8960
6418
+ },
6419
+ {
6420
+ "epoch": 0.22,
6421
+ "grad_norm": 3.7693331241607666,
6422
+ "learning_rate": 2.168421052631579e-06,
6423
+ "loss": 0.8047,
6424
+ "step": 8970
6425
+ },
6426
+ {
6427
+ "epoch": 0.22,
6428
+ "grad_norm": 3.4705393314361572,
6429
+ "learning_rate": 2.1473684210526317e-06,
6430
+ "loss": 0.7832,
6431
+ "step": 8980
6432
+ },
6433
+ {
6434
+ "epoch": 0.22,
6435
+ "grad_norm": 4.295872688293457,
6436
+ "learning_rate": 2.1263157894736844e-06,
6437
+ "loss": 0.7355,
6438
+ "step": 8990
6439
+ },
6440
+ {
6441
+ "epoch": 0.23,
6442
+ "grad_norm": 3.0480620861053467,
6443
+ "learning_rate": 2.105263157894737e-06,
6444
+ "loss": 0.6739,
6445
+ "step": 9000
6446
+ },
6447
+ {
6448
+ "epoch": 0.23,
6449
+ "eval_loss": 0.7442497611045837,
6450
+ "eval_runtime": 67.8767,
6451
+ "eval_samples_per_second": 14.733,
6452
+ "eval_steps_per_second": 14.733,
6453
+ "step": 9000
6454
+ },
6455
+ {
6456
+ "epoch": 0.23,
6457
+ "grad_norm": 2.9723927974700928,
6458
+ "learning_rate": 2.0842105263157897e-06,
6459
+ "loss": 0.7003,
6460
+ "step": 9010
6461
+ },
6462
+ {
6463
+ "epoch": 0.23,
6464
+ "grad_norm": 2.0932421684265137,
6465
+ "learning_rate": 2.0631578947368424e-06,
6466
+ "loss": 0.6897,
6467
+ "step": 9020
6468
+ },
6469
+ {
6470
+ "epoch": 0.23,
6471
+ "grad_norm": 4.70625114440918,
6472
+ "learning_rate": 2.042105263157895e-06,
6473
+ "loss": 0.8106,
6474
+ "step": 9030
6475
+ },
6476
+ {
6477
+ "epoch": 0.23,
6478
+ "grad_norm": 3.2763564586639404,
6479
+ "learning_rate": 2.0210526315789477e-06,
6480
+ "loss": 0.7387,
6481
+ "step": 9040
6482
+ },
6483
+ {
6484
+ "epoch": 0.23,
6485
+ "grad_norm": 4.553431034088135,
6486
+ "learning_rate": 2.0000000000000003e-06,
6487
+ "loss": 0.7435,
6488
+ "step": 9050
6489
+ },
6490
+ {
6491
+ "epoch": 0.23,
6492
+ "grad_norm": 5.36479377746582,
6493
+ "learning_rate": 1.978947368421053e-06,
6494
+ "loss": 0.7713,
6495
+ "step": 9060
6496
+ },
6497
+ {
6498
+ "epoch": 0.23,
6499
+ "grad_norm": 4.923874855041504,
6500
+ "learning_rate": 1.9578947368421052e-06,
6501
+ "loss": 0.5508,
6502
+ "step": 9070
6503
+ },
6504
+ {
6505
+ "epoch": 0.23,
6506
+ "grad_norm": 8.63404655456543,
6507
+ "learning_rate": 1.936842105263158e-06,
6508
+ "loss": 0.7323,
6509
+ "step": 9080
6510
+ },
6511
+ {
6512
+ "epoch": 0.23,
6513
+ "grad_norm": 5.521135330200195,
6514
+ "learning_rate": 1.9157894736842105e-06,
6515
+ "loss": 0.699,
6516
+ "step": 9090
6517
+ },
6518
+ {
6519
+ "epoch": 0.23,
6520
+ "grad_norm": 9.009405136108398,
6521
+ "learning_rate": 1.8947368421052634e-06,
6522
+ "loss": 0.789,
6523
+ "step": 9100
6524
+ },
6525
+ {
6526
+ "epoch": 0.23,
6527
+ "grad_norm": 12.834007263183594,
6528
+ "learning_rate": 1.8736842105263158e-06,
6529
+ "loss": 0.7382,
6530
+ "step": 9110
6531
+ },
6532
+ {
6533
+ "epoch": 0.23,
6534
+ "grad_norm": 3.753262758255005,
6535
+ "learning_rate": 1.8526315789473687e-06,
6536
+ "loss": 0.7035,
6537
+ "step": 9120
6538
+ },
6539
+ {
6540
+ "epoch": 0.23,
6541
+ "grad_norm": 3.300708770751953,
6542
+ "learning_rate": 1.8315789473684211e-06,
6543
+ "loss": 0.7558,
6544
+ "step": 9130
6545
+ },
6546
+ {
6547
+ "epoch": 0.23,
6548
+ "grad_norm": 4.416452884674072,
6549
+ "learning_rate": 1.810526315789474e-06,
6550
+ "loss": 0.6854,
6551
+ "step": 9140
6552
+ },
6553
+ {
6554
+ "epoch": 0.23,
6555
+ "grad_norm": 7.664788722991943,
6556
+ "learning_rate": 1.7894736842105265e-06,
6557
+ "loss": 0.6951,
6558
+ "step": 9150
6559
+ },
6560
+ {
6561
+ "epoch": 0.23,
6562
+ "grad_norm": 3.646073818206787,
6563
+ "learning_rate": 1.768421052631579e-06,
6564
+ "loss": 0.7472,
6565
+ "step": 9160
6566
+ },
6567
+ {
6568
+ "epoch": 0.23,
6569
+ "grad_norm": 3.125991106033325,
6570
+ "learning_rate": 1.7473684210526318e-06,
6571
+ "loss": 0.6711,
6572
+ "step": 9170
6573
+ },
6574
+ {
6575
+ "epoch": 0.23,
6576
+ "grad_norm": 5.308753967285156,
6577
+ "learning_rate": 1.7263157894736842e-06,
6578
+ "loss": 0.6393,
6579
+ "step": 9180
6580
+ },
6581
+ {
6582
+ "epoch": 0.23,
6583
+ "grad_norm": 11.79830265045166,
6584
+ "learning_rate": 1.705263157894737e-06,
6585
+ "loss": 0.7358,
6586
+ "step": 9190
6587
+ },
6588
+ {
6589
+ "epoch": 0.23,
6590
+ "grad_norm": 6.862399101257324,
6591
+ "learning_rate": 1.6842105263157895e-06,
6592
+ "loss": 0.8422,
6593
+ "step": 9200
6594
+ },
6595
+ {
6596
+ "epoch": 0.23,
6597
+ "grad_norm": 5.3199968338012695,
6598
+ "learning_rate": 1.6631578947368424e-06,
6599
+ "loss": 0.6999,
6600
+ "step": 9210
6601
+ },
6602
+ {
6603
+ "epoch": 0.23,
6604
+ "grad_norm": 3.263275146484375,
6605
+ "learning_rate": 1.6421052631578948e-06,
6606
+ "loss": 0.7122,
6607
+ "step": 9220
6608
+ },
6609
+ {
6610
+ "epoch": 0.23,
6611
+ "grad_norm": 4.283051490783691,
6612
+ "learning_rate": 1.6210526315789473e-06,
6613
+ "loss": 0.7793,
6614
+ "step": 9230
6615
+ },
6616
+ {
6617
+ "epoch": 0.23,
6618
+ "grad_norm": 2.0055785179138184,
6619
+ "learning_rate": 1.6000000000000001e-06,
6620
+ "loss": 0.732,
6621
+ "step": 9240
6622
+ },
6623
+ {
6624
+ "epoch": 0.23,
6625
+ "grad_norm": 4.184137344360352,
6626
+ "learning_rate": 1.5789473684210526e-06,
6627
+ "loss": 0.7339,
6628
+ "step": 9250
6629
+ },
6630
+ {
6631
+ "epoch": 0.23,
6632
+ "grad_norm": 3.587636709213257,
6633
+ "learning_rate": 1.5578947368421054e-06,
6634
+ "loss": 0.8473,
6635
+ "step": 9260
6636
+ },
6637
+ {
6638
+ "epoch": 0.23,
6639
+ "grad_norm": 8.189043045043945,
6640
+ "learning_rate": 1.5368421052631579e-06,
6641
+ "loss": 0.6498,
6642
+ "step": 9270
6643
+ },
6644
+ {
6645
+ "epoch": 0.23,
6646
+ "grad_norm": 3.4272284507751465,
6647
+ "learning_rate": 1.5157894736842108e-06,
6648
+ "loss": 0.7676,
6649
+ "step": 9280
6650
+ },
6651
+ {
6652
+ "epoch": 0.23,
6653
+ "grad_norm": 3.280287027359009,
6654
+ "learning_rate": 1.4947368421052632e-06,
6655
+ "loss": 0.6283,
6656
+ "step": 9290
6657
+ },
6658
+ {
6659
+ "epoch": 0.23,
6660
+ "grad_norm": 8.722474098205566,
6661
+ "learning_rate": 1.4736842105263159e-06,
6662
+ "loss": 0.7555,
6663
+ "step": 9300
6664
+ },
6665
+ {
6666
+ "epoch": 0.23,
6667
+ "grad_norm": 4.574818134307861,
6668
+ "learning_rate": 1.4526315789473685e-06,
6669
+ "loss": 0.7481,
6670
+ "step": 9310
6671
+ },
6672
+ {
6673
+ "epoch": 0.23,
6674
+ "grad_norm": 3.0097527503967285,
6675
+ "learning_rate": 1.4315789473684212e-06,
6676
+ "loss": 0.6181,
6677
+ "step": 9320
6678
+ },
6679
+ {
6680
+ "epoch": 0.23,
6681
+ "grad_norm": 6.725505352020264,
6682
+ "learning_rate": 1.4105263157894738e-06,
6683
+ "loss": 0.677,
6684
+ "step": 9330
6685
+ },
6686
+ {
6687
+ "epoch": 0.23,
6688
+ "grad_norm": 2.934959888458252,
6689
+ "learning_rate": 1.3894736842105263e-06,
6690
+ "loss": 0.6932,
6691
+ "step": 9340
6692
+ },
6693
+ {
6694
+ "epoch": 0.23,
6695
+ "grad_norm": 2.7491650581359863,
6696
+ "learning_rate": 1.3684210526315791e-06,
6697
+ "loss": 0.7361,
6698
+ "step": 9350
6699
+ },
6700
+ {
6701
+ "epoch": 0.23,
6702
+ "grad_norm": 4.734315872192383,
6703
+ "learning_rate": 1.3473684210526316e-06,
6704
+ "loss": 0.6442,
6705
+ "step": 9360
6706
+ },
6707
+ {
6708
+ "epoch": 0.23,
6709
+ "grad_norm": 4.301790714263916,
6710
+ "learning_rate": 1.3263157894736844e-06,
6711
+ "loss": 0.7642,
6712
+ "step": 9370
6713
+ },
6714
+ {
6715
+ "epoch": 0.23,
6716
+ "grad_norm": 4.042958736419678,
6717
+ "learning_rate": 1.3052631578947369e-06,
6718
+ "loss": 0.7974,
6719
+ "step": 9380
6720
+ },
6721
+ {
6722
+ "epoch": 0.23,
6723
+ "grad_norm": 4.941096782684326,
6724
+ "learning_rate": 1.2842105263157895e-06,
6725
+ "loss": 0.8603,
6726
+ "step": 9390
6727
+ },
6728
+ {
6729
+ "epoch": 0.23,
6730
+ "grad_norm": 4.379117488861084,
6731
+ "learning_rate": 1.2631578947368422e-06,
6732
+ "loss": 0.8297,
6733
+ "step": 9400
6734
+ },
6735
+ {
6736
+ "epoch": 0.24,
6737
+ "grad_norm": 6.3129048347473145,
6738
+ "learning_rate": 1.2421052631578948e-06,
6739
+ "loss": 0.7783,
6740
+ "step": 9410
6741
+ },
6742
+ {
6743
+ "epoch": 0.24,
6744
+ "grad_norm": 5.5439133644104,
6745
+ "learning_rate": 1.2210526315789475e-06,
6746
+ "loss": 0.8122,
6747
+ "step": 9420
6748
+ },
6749
+ {
6750
+ "epoch": 0.24,
6751
+ "grad_norm": 6.480744361877441,
6752
+ "learning_rate": 1.2000000000000002e-06,
6753
+ "loss": 0.7779,
6754
+ "step": 9430
6755
+ },
6756
+ {
6757
+ "epoch": 0.24,
6758
+ "grad_norm": 5.862485408782959,
6759
+ "learning_rate": 1.1789473684210526e-06,
6760
+ "loss": 0.6917,
6761
+ "step": 9440
6762
+ },
6763
+ {
6764
+ "epoch": 0.24,
6765
+ "grad_norm": 5.7247443199157715,
6766
+ "learning_rate": 1.1578947368421053e-06,
6767
+ "loss": 0.7017,
6768
+ "step": 9450
6769
+ },
6770
+ {
6771
+ "epoch": 0.24,
6772
+ "grad_norm": 8.194451332092285,
6773
+ "learning_rate": 1.136842105263158e-06,
6774
+ "loss": 0.7031,
6775
+ "step": 9460
6776
+ },
6777
+ {
6778
+ "epoch": 0.24,
6779
+ "grad_norm": 8.057929992675781,
6780
+ "learning_rate": 1.1157894736842106e-06,
6781
+ "loss": 0.7116,
6782
+ "step": 9470
6783
+ },
6784
+ {
6785
+ "epoch": 0.24,
6786
+ "grad_norm": 4.529337406158447,
6787
+ "learning_rate": 1.0947368421052632e-06,
6788
+ "loss": 0.8314,
6789
+ "step": 9480
6790
+ },
6791
+ {
6792
+ "epoch": 0.24,
6793
+ "grad_norm": 7.412846565246582,
6794
+ "learning_rate": 1.0736842105263159e-06,
6795
+ "loss": 0.6448,
6796
+ "step": 9490
6797
+ },
6798
+ {
6799
+ "epoch": 0.24,
6800
+ "grad_norm": 3.7076497077941895,
6801
+ "learning_rate": 1.0526315789473685e-06,
6802
+ "loss": 0.6291,
6803
+ "step": 9500
6804
+ },
6805
+ {
6806
+ "epoch": 0.24,
6807
+ "eval_loss": 0.7395394444465637,
6808
+ "eval_runtime": 67.8841,
6809
+ "eval_samples_per_second": 14.731,
6810
+ "eval_steps_per_second": 14.731,
6811
+ "step": 9500
6812
+ },
6813
+ {
6814
+ "epoch": 0.24,
6815
+ "grad_norm": 4.488115310668945,
6816
+ "learning_rate": 1.0315789473684212e-06,
6817
+ "loss": 0.8611,
6818
+ "step": 9510
6819
+ },
6820
+ {
6821
+ "epoch": 0.24,
6822
+ "grad_norm": 1.6314383745193481,
6823
+ "learning_rate": 1.0105263157894738e-06,
6824
+ "loss": 0.7694,
6825
+ "step": 9520
6826
+ },
6827
+ {
6828
+ "epoch": 0.24,
6829
+ "grad_norm": 5.290372848510742,
6830
+ "learning_rate": 9.894736842105265e-07,
6831
+ "loss": 0.7166,
6832
+ "step": 9530
6833
+ },
6834
+ {
6835
+ "epoch": 0.24,
6836
+ "grad_norm": 3.1572625637054443,
6837
+ "learning_rate": 9.68421052631579e-07,
6838
+ "loss": 0.7649,
6839
+ "step": 9540
6840
+ },
6841
+ {
6842
+ "epoch": 0.24,
6843
+ "grad_norm": 4.951930999755859,
6844
+ "learning_rate": 9.473684210526317e-07,
6845
+ "loss": 0.7057,
6846
+ "step": 9550
6847
+ },
6848
+ {
6849
+ "epoch": 0.24,
6850
+ "grad_norm": 4.696636199951172,
6851
+ "learning_rate": 9.263157894736844e-07,
6852
+ "loss": 0.7853,
6853
+ "step": 9560
6854
+ },
6855
+ {
6856
+ "epoch": 0.24,
6857
+ "grad_norm": 4.211262226104736,
6858
+ "learning_rate": 9.05263157894737e-07,
6859
+ "loss": 0.6612,
6860
+ "step": 9570
6861
+ },
6862
+ {
6863
+ "epoch": 0.24,
6864
+ "grad_norm": 4.584897041320801,
6865
+ "learning_rate": 8.842105263157895e-07,
6866
+ "loss": 0.6393,
6867
+ "step": 9580
6868
+ },
6869
+ {
6870
+ "epoch": 0.24,
6871
+ "grad_norm": 4.64282751083374,
6872
+ "learning_rate": 8.631578947368421e-07,
6873
+ "loss": 0.7915,
6874
+ "step": 9590
6875
+ },
6876
+ {
6877
+ "epoch": 0.24,
6878
+ "grad_norm": 3.691389799118042,
6879
+ "learning_rate": 8.421052631578948e-07,
6880
+ "loss": 0.659,
6881
+ "step": 9600
6882
+ },
6883
+ {
6884
+ "epoch": 0.24,
6885
+ "grad_norm": 4.740243911743164,
6886
+ "learning_rate": 8.210526315789474e-07,
6887
+ "loss": 0.7134,
6888
+ "step": 9610
6889
+ },
6890
+ {
6891
+ "epoch": 0.24,
6892
+ "grad_norm": 6.811493873596191,
6893
+ "learning_rate": 8.000000000000001e-07,
6894
+ "loss": 0.8592,
6895
+ "step": 9620
6896
+ },
6897
+ {
6898
+ "epoch": 0.24,
6899
+ "grad_norm": 3.2056334018707275,
6900
+ "learning_rate": 7.789473684210527e-07,
6901
+ "loss": 0.6753,
6902
+ "step": 9630
6903
+ },
6904
+ {
6905
+ "epoch": 0.24,
6906
+ "grad_norm": 4.347885608673096,
6907
+ "learning_rate": 7.578947368421054e-07,
6908
+ "loss": 0.7476,
6909
+ "step": 9640
6910
+ },
6911
+ {
6912
+ "epoch": 0.24,
6913
+ "grad_norm": 5.63771915435791,
6914
+ "learning_rate": 7.368421052631579e-07,
6915
+ "loss": 0.7649,
6916
+ "step": 9650
6917
+ },
6918
+ {
6919
+ "epoch": 0.24,
6920
+ "grad_norm": 3.062124013900757,
6921
+ "learning_rate": 7.157894736842106e-07,
6922
+ "loss": 0.6792,
6923
+ "step": 9660
6924
+ },
6925
+ {
6926
+ "epoch": 0.24,
6927
+ "grad_norm": 9.334321022033691,
6928
+ "learning_rate": 6.947368421052631e-07,
6929
+ "loss": 0.7626,
6930
+ "step": 9670
6931
+ },
6932
+ {
6933
+ "epoch": 0.24,
6934
+ "grad_norm": 7.429685115814209,
6935
+ "learning_rate": 6.736842105263158e-07,
6936
+ "loss": 0.6943,
6937
+ "step": 9680
6938
+ },
6939
+ {
6940
+ "epoch": 0.24,
6941
+ "grad_norm": 4.459277629852295,
6942
+ "learning_rate": 6.526315789473684e-07,
6943
+ "loss": 0.7838,
6944
+ "step": 9690
6945
+ },
6946
+ {
6947
+ "epoch": 0.24,
6948
+ "grad_norm": 6.821927070617676,
6949
+ "learning_rate": 6.315789473684211e-07,
6950
+ "loss": 0.7103,
6951
+ "step": 9700
6952
+ },
6953
+ {
6954
+ "epoch": 0.24,
6955
+ "grad_norm": 10.438909530639648,
6956
+ "learning_rate": 6.105263157894738e-07,
6957
+ "loss": 0.7509,
6958
+ "step": 9710
6959
+ },
6960
+ {
6961
+ "epoch": 0.24,
6962
+ "grad_norm": 11.55811882019043,
6963
+ "learning_rate": 5.894736842105263e-07,
6964
+ "loss": 0.7623,
6965
+ "step": 9720
6966
+ },
6967
+ {
6968
+ "epoch": 0.24,
6969
+ "grad_norm": 3.1809043884277344,
6970
+ "learning_rate": 5.68421052631579e-07,
6971
+ "loss": 0.6294,
6972
+ "step": 9730
6973
+ },
6974
+ {
6975
+ "epoch": 0.24,
6976
+ "grad_norm": 5.337337970733643,
6977
+ "learning_rate": 5.473684210526316e-07,
6978
+ "loss": 0.763,
6979
+ "step": 9740
6980
+ },
6981
+ {
6982
+ "epoch": 0.24,
6983
+ "grad_norm": 8.130523681640625,
6984
+ "learning_rate": 5.263157894736843e-07,
6985
+ "loss": 0.6404,
6986
+ "step": 9750
6987
+ },
6988
+ {
6989
+ "epoch": 0.24,
6990
+ "grad_norm": 4.213668346405029,
6991
+ "learning_rate": 5.052631578947369e-07,
6992
+ "loss": 0.7379,
6993
+ "step": 9760
6994
+ },
6995
+ {
6996
+ "epoch": 0.24,
6997
+ "grad_norm": 3.8605246543884277,
6998
+ "learning_rate": 4.842105263157895e-07,
6999
+ "loss": 0.7483,
7000
+ "step": 9770
7001
+ },
7002
+ {
7003
+ "epoch": 0.24,
7004
+ "grad_norm": 4.358519077301025,
7005
+ "learning_rate": 4.631578947368422e-07,
7006
+ "loss": 0.6823,
7007
+ "step": 9780
7008
+ },
7009
+ {
7010
+ "epoch": 0.24,
7011
+ "grad_norm": 2.9712955951690674,
7012
+ "learning_rate": 4.421052631578947e-07,
7013
+ "loss": 0.679,
7014
+ "step": 9790
7015
+ },
7016
+ {
7017
+ "epoch": 0.24,
7018
+ "grad_norm": 6.285613059997559,
7019
+ "learning_rate": 4.210526315789474e-07,
7020
+ "loss": 0.7763,
7021
+ "step": 9800
7022
+ },
7023
+ {
7024
+ "epoch": 0.25,
7025
+ "grad_norm": 2.434277296066284,
7026
+ "learning_rate": 4.0000000000000003e-07,
7027
+ "loss": 0.8558,
7028
+ "step": 9810
7029
+ },
7030
+ {
7031
+ "epoch": 0.25,
7032
+ "grad_norm": 7.880703449249268,
7033
+ "learning_rate": 3.789473684210527e-07,
7034
+ "loss": 0.7494,
7035
+ "step": 9820
7036
+ },
7037
+ {
7038
+ "epoch": 0.25,
7039
+ "grad_norm": 11.698799133300781,
7040
+ "learning_rate": 3.578947368421053e-07,
7041
+ "loss": 0.6576,
7042
+ "step": 9830
7043
+ },
7044
+ {
7045
+ "epoch": 0.25,
7046
+ "grad_norm": 3.2752954959869385,
7047
+ "learning_rate": 3.368421052631579e-07,
7048
+ "loss": 0.6494,
7049
+ "step": 9840
7050
+ },
7051
+ {
7052
+ "epoch": 0.25,
7053
+ "grad_norm": 2.878567934036255,
7054
+ "learning_rate": 3.1578947368421055e-07,
7055
+ "loss": 0.6781,
7056
+ "step": 9850
7057
+ },
7058
+ {
7059
+ "epoch": 0.25,
7060
+ "grad_norm": 3.6086246967315674,
7061
+ "learning_rate": 2.9473684210526315e-07,
7062
+ "loss": 0.7339,
7063
+ "step": 9860
7064
+ },
7065
+ {
7066
+ "epoch": 0.25,
7067
+ "grad_norm": 5.403782844543457,
7068
+ "learning_rate": 2.736842105263158e-07,
7069
+ "loss": 0.7738,
7070
+ "step": 9870
7071
+ },
7072
+ {
7073
+ "epoch": 0.25,
7074
+ "grad_norm": 4.487565994262695,
7075
+ "learning_rate": 2.5263157894736846e-07,
7076
+ "loss": 0.8165,
7077
+ "step": 9880
7078
+ },
7079
+ {
7080
+ "epoch": 0.25,
7081
+ "grad_norm": 4.29118537902832,
7082
+ "learning_rate": 2.315789473684211e-07,
7083
+ "loss": 0.6272,
7084
+ "step": 9890
7085
+ },
7086
+ {
7087
+ "epoch": 0.25,
7088
+ "grad_norm": 3.634309768676758,
7089
+ "learning_rate": 2.105263157894737e-07,
7090
+ "loss": 0.6641,
7091
+ "step": 9900
7092
+ },
7093
+ {
7094
+ "epoch": 0.25,
7095
+ "grad_norm": 4.989073276519775,
7096
+ "learning_rate": 1.8947368421052634e-07,
7097
+ "loss": 0.7111,
7098
+ "step": 9910
7099
+ },
7100
+ {
7101
+ "epoch": 0.25,
7102
+ "grad_norm": 5.606556415557861,
7103
+ "learning_rate": 1.6842105263157895e-07,
7104
+ "loss": 0.6112,
7105
+ "step": 9920
7106
+ },
7107
+ {
7108
+ "epoch": 0.25,
7109
+ "grad_norm": 5.012443542480469,
7110
+ "learning_rate": 1.4736842105263158e-07,
7111
+ "loss": 0.6684,
7112
+ "step": 9930
7113
+ },
7114
+ {
7115
+ "epoch": 0.25,
7116
+ "grad_norm": 6.287766933441162,
7117
+ "learning_rate": 1.2631578947368423e-07,
7118
+ "loss": 0.6687,
7119
+ "step": 9940
7120
+ },
7121
+ {
7122
+ "epoch": 0.25,
7123
+ "grad_norm": 3.646402597427368,
7124
+ "learning_rate": 1.0526315789473685e-07,
7125
+ "loss": 0.6452,
7126
+ "step": 9950
7127
+ },
7128
+ {
7129
+ "epoch": 0.25,
7130
+ "grad_norm": 7.9046950340271,
7131
+ "learning_rate": 8.421052631578947e-08,
7132
+ "loss": 0.7636,
7133
+ "step": 9960
7134
+ },
7135
+ {
7136
+ "epoch": 0.25,
7137
+ "grad_norm": 4.733578681945801,
7138
+ "learning_rate": 6.315789473684211e-08,
7139
+ "loss": 0.6619,
7140
+ "step": 9970
7141
+ },
7142
+ {
7143
+ "epoch": 0.25,
7144
+ "grad_norm": 2.342442274093628,
7145
+ "learning_rate": 4.2105263157894737e-08,
7146
+ "loss": 0.74,
7147
+ "step": 9980
7148
+ },
7149
+ {
7150
+ "epoch": 0.25,
7151
+ "grad_norm": 4.0832839012146,
7152
+ "learning_rate": 2.1052631578947368e-08,
7153
+ "loss": 0.7314,
7154
+ "step": 9990
7155
+ },
7156
+ {
7157
+ "epoch": 0.25,
7158
+ "grad_norm": 2.517941951751709,
7159
+ "learning_rate": 0.0,
7160
+ "loss": 0.755,
7161
+ "step": 10000
7162
+ },
7163
+ {
7164
+ "epoch": 0.25,
7165
+ "eval_loss": 0.7402730584144592,
7166
+ "eval_runtime": 67.899,
7167
+ "eval_samples_per_second": 14.728,
7168
+ "eval_steps_per_second": 14.728,
7169
+ "step": 10000
7170
  }
7171
  ],
7172
  "logging_steps": 10,
 
7174
  "num_input_tokens_seen": 0,
7175
  "num_train_epochs": 1,
7176
  "save_steps": 2500,
7177
+ "total_flos": 1.6102125993984e+17,
7178
  "train_batch_size": 1,
7179
  "trial_name": null,
7180
  "trial_params": null