jdannem6 commited on
Commit
648c0e7
·
verified ·
1 Parent(s): c314172

Uploaded checkpoint-27500

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1795 -5
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b60dbd1f94dabd67771a7dbb23e311620b79fcdb1484984d36ba1bd3a7cde11f
3
  size 119975656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:139f4435bd5729787408c5615feba1fe9895d2e7bd0f5d89d9346fdd1ac87574
3
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1f9cebeee181ccfcd69f58bc24273d88b37695243d306a370f1d624a8e393f7
3
  size 240145026
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:018c500e950751508e9ad4d41e38708e2cf3bf2e66584879f67e33a3a31f85fa
3
  size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:091e95eb012003506d5f3742551610a86c48243d82f2a20e48d123c1d007d367
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09598c5aaf58b4976874761472c065fd04118ddf756eeecbee75a092b841ae97
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0769304697a92d05b3f54a364ac1e52204140fdb95fd093b56a8d6138f45860
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30c1646ffd4f2e4e86a7c5c87af0949f3be46b7539d2a0137b1bb01bf3e8bbe5
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.7771185040473938,
3
- "best_model_checkpoint": "runs/deepseek_lora_20240423-223943/checkpoint-5000",
4
- "epoch": 0.125,
5
  "eval_steps": 500,
6
- "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3587,6 +3587,1796 @@
3587
  "eval_samples_per_second": 14.708,
3588
  "eval_steps_per_second": 14.708,
3589
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3590
  }
3591
  ],
3592
  "logging_steps": 10,
@@ -3594,7 +5384,7 @@
3594
  "num_input_tokens_seen": 0,
3595
  "num_train_epochs": 1,
3596
  "save_steps": 2500,
3597
- "total_flos": 8.051062996992e+16,
3598
  "train_batch_size": 1,
3599
  "trial_name": null,
3600
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7507393956184387,
3
+ "best_model_checkpoint": "runs/deepseek_lora_20240423-223943/checkpoint-7500",
4
+ "epoch": 0.1875,
5
  "eval_steps": 500,
6
+ "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3587
  "eval_samples_per_second": 14.708,
3588
  "eval_steps_per_second": 14.708,
3589
  "step": 5000
3590
+ },
3591
+ {
3592
+ "epoch": 0.13,
3593
+ "grad_norm": 3.4830055236816406,
3594
+ "learning_rate": 1.0505263157894739e-05,
3595
+ "loss": 0.9099,
3596
+ "step": 5010
3597
+ },
3598
+ {
3599
+ "epoch": 0.13,
3600
+ "grad_norm": 5.7392096519470215,
3601
+ "learning_rate": 1.048421052631579e-05,
3602
+ "loss": 0.6423,
3603
+ "step": 5020
3604
+ },
3605
+ {
3606
+ "epoch": 0.13,
3607
+ "grad_norm": 2.720612049102783,
3608
+ "learning_rate": 1.0463157894736844e-05,
3609
+ "loss": 0.7826,
3610
+ "step": 5030
3611
+ },
3612
+ {
3613
+ "epoch": 0.13,
3614
+ "grad_norm": 3.0437145233154297,
3615
+ "learning_rate": 1.0442105263157895e-05,
3616
+ "loss": 0.7865,
3617
+ "step": 5040
3618
+ },
3619
+ {
3620
+ "epoch": 0.13,
3621
+ "grad_norm": 8.835311889648438,
3622
+ "learning_rate": 1.0421052631578948e-05,
3623
+ "loss": 0.7778,
3624
+ "step": 5050
3625
+ },
3626
+ {
3627
+ "epoch": 0.13,
3628
+ "grad_norm": 7.596973419189453,
3629
+ "learning_rate": 1.04e-05,
3630
+ "loss": 0.7381,
3631
+ "step": 5060
3632
+ },
3633
+ {
3634
+ "epoch": 0.13,
3635
+ "grad_norm": 4.108314037322998,
3636
+ "learning_rate": 1.0378947368421053e-05,
3637
+ "loss": 0.7689,
3638
+ "step": 5070
3639
+ },
3640
+ {
3641
+ "epoch": 0.13,
3642
+ "grad_norm": 3.865196704864502,
3643
+ "learning_rate": 1.0357894736842107e-05,
3644
+ "loss": 0.7785,
3645
+ "step": 5080
3646
+ },
3647
+ {
3648
+ "epoch": 0.13,
3649
+ "grad_norm": 3.4403493404388428,
3650
+ "learning_rate": 1.0336842105263158e-05,
3651
+ "loss": 0.8322,
3652
+ "step": 5090
3653
+ },
3654
+ {
3655
+ "epoch": 0.13,
3656
+ "grad_norm": 3.243029832839966,
3657
+ "learning_rate": 1.0315789473684213e-05,
3658
+ "loss": 0.6658,
3659
+ "step": 5100
3660
+ },
3661
+ {
3662
+ "epoch": 0.13,
3663
+ "grad_norm": 3.806818962097168,
3664
+ "learning_rate": 1.0294736842105264e-05,
3665
+ "loss": 0.781,
3666
+ "step": 5110
3667
+ },
3668
+ {
3669
+ "epoch": 0.13,
3670
+ "grad_norm": 3.820622205734253,
3671
+ "learning_rate": 1.0273684210526316e-05,
3672
+ "loss": 0.7499,
3673
+ "step": 5120
3674
+ },
3675
+ {
3676
+ "epoch": 0.13,
3677
+ "grad_norm": 4.203964710235596,
3678
+ "learning_rate": 1.0252631578947369e-05,
3679
+ "loss": 0.7702,
3680
+ "step": 5130
3681
+ },
3682
+ {
3683
+ "epoch": 0.13,
3684
+ "grad_norm": 2.803215503692627,
3685
+ "learning_rate": 1.0231578947368422e-05,
3686
+ "loss": 0.6291,
3687
+ "step": 5140
3688
+ },
3689
+ {
3690
+ "epoch": 0.13,
3691
+ "grad_norm": 5.486114978790283,
3692
+ "learning_rate": 1.0210526315789476e-05,
3693
+ "loss": 0.8124,
3694
+ "step": 5150
3695
+ },
3696
+ {
3697
+ "epoch": 0.13,
3698
+ "grad_norm": 7.74938440322876,
3699
+ "learning_rate": 1.0189473684210527e-05,
3700
+ "loss": 0.7735,
3701
+ "step": 5160
3702
+ },
3703
+ {
3704
+ "epoch": 0.13,
3705
+ "grad_norm": 4.10128116607666,
3706
+ "learning_rate": 1.0168421052631581e-05,
3707
+ "loss": 0.6809,
3708
+ "step": 5170
3709
+ },
3710
+ {
3711
+ "epoch": 0.13,
3712
+ "grad_norm": 6.844088554382324,
3713
+ "learning_rate": 1.0147368421052632e-05,
3714
+ "loss": 0.8294,
3715
+ "step": 5180
3716
+ },
3717
+ {
3718
+ "epoch": 0.13,
3719
+ "grad_norm": 4.329681873321533,
3720
+ "learning_rate": 1.0126315789473685e-05,
3721
+ "loss": 0.861,
3722
+ "step": 5190
3723
+ },
3724
+ {
3725
+ "epoch": 0.13,
3726
+ "grad_norm": 12.482446670532227,
3727
+ "learning_rate": 1.0105263157894738e-05,
3728
+ "loss": 0.7346,
3729
+ "step": 5200
3730
+ },
3731
+ {
3732
+ "epoch": 0.13,
3733
+ "grad_norm": 1.8471055030822754,
3734
+ "learning_rate": 1.008421052631579e-05,
3735
+ "loss": 0.7714,
3736
+ "step": 5210
3737
+ },
3738
+ {
3739
+ "epoch": 0.13,
3740
+ "grad_norm": 3.1509273052215576,
3741
+ "learning_rate": 1.0063157894736843e-05,
3742
+ "loss": 0.697,
3743
+ "step": 5220
3744
+ },
3745
+ {
3746
+ "epoch": 0.13,
3747
+ "grad_norm": 4.524876117706299,
3748
+ "learning_rate": 1.0042105263157896e-05,
3749
+ "loss": 0.8373,
3750
+ "step": 5230
3751
+ },
3752
+ {
3753
+ "epoch": 0.13,
3754
+ "grad_norm": 2.7305006980895996,
3755
+ "learning_rate": 1.002105263157895e-05,
3756
+ "loss": 0.7182,
3757
+ "step": 5240
3758
+ },
3759
+ {
3760
+ "epoch": 0.13,
3761
+ "grad_norm": 2.5194203853607178,
3762
+ "learning_rate": 1e-05,
3763
+ "loss": 0.794,
3764
+ "step": 5250
3765
+ },
3766
+ {
3767
+ "epoch": 0.13,
3768
+ "grad_norm": 14.967845916748047,
3769
+ "learning_rate": 9.978947368421053e-06,
3770
+ "loss": 0.7564,
3771
+ "step": 5260
3772
+ },
3773
+ {
3774
+ "epoch": 0.13,
3775
+ "grad_norm": 1.8730751276016235,
3776
+ "learning_rate": 9.957894736842106e-06,
3777
+ "loss": 0.726,
3778
+ "step": 5270
3779
+ },
3780
+ {
3781
+ "epoch": 0.13,
3782
+ "grad_norm": 2.1793789863586426,
3783
+ "learning_rate": 9.936842105263159e-06,
3784
+ "loss": 0.7019,
3785
+ "step": 5280
3786
+ },
3787
+ {
3788
+ "epoch": 0.13,
3789
+ "grad_norm": 5.0785651206970215,
3790
+ "learning_rate": 9.915789473684211e-06,
3791
+ "loss": 0.7771,
3792
+ "step": 5290
3793
+ },
3794
+ {
3795
+ "epoch": 0.13,
3796
+ "grad_norm": 9.810837745666504,
3797
+ "learning_rate": 9.894736842105264e-06,
3798
+ "loss": 0.7542,
3799
+ "step": 5300
3800
+ },
3801
+ {
3802
+ "epoch": 0.13,
3803
+ "grad_norm": 24.654855728149414,
3804
+ "learning_rate": 9.873684210526317e-06,
3805
+ "loss": 0.7928,
3806
+ "step": 5310
3807
+ },
3808
+ {
3809
+ "epoch": 0.13,
3810
+ "grad_norm": 3.083669424057007,
3811
+ "learning_rate": 9.85263157894737e-06,
3812
+ "loss": 0.8091,
3813
+ "step": 5320
3814
+ },
3815
+ {
3816
+ "epoch": 0.13,
3817
+ "grad_norm": 3.9507665634155273,
3818
+ "learning_rate": 9.831578947368422e-06,
3819
+ "loss": 0.7548,
3820
+ "step": 5330
3821
+ },
3822
+ {
3823
+ "epoch": 0.13,
3824
+ "grad_norm": 2.55362606048584,
3825
+ "learning_rate": 9.810526315789475e-06,
3826
+ "loss": 0.7804,
3827
+ "step": 5340
3828
+ },
3829
+ {
3830
+ "epoch": 0.13,
3831
+ "grad_norm": 3.572410821914673,
3832
+ "learning_rate": 9.789473684210527e-06,
3833
+ "loss": 0.748,
3834
+ "step": 5350
3835
+ },
3836
+ {
3837
+ "epoch": 0.13,
3838
+ "grad_norm": 3.70060658454895,
3839
+ "learning_rate": 9.76842105263158e-06,
3840
+ "loss": 0.7303,
3841
+ "step": 5360
3842
+ },
3843
+ {
3844
+ "epoch": 0.13,
3845
+ "grad_norm": 3.397512674331665,
3846
+ "learning_rate": 9.747368421052633e-06,
3847
+ "loss": 0.7209,
3848
+ "step": 5370
3849
+ },
3850
+ {
3851
+ "epoch": 0.13,
3852
+ "grad_norm": 2.797943592071533,
3853
+ "learning_rate": 9.726315789473685e-06,
3854
+ "loss": 0.9082,
3855
+ "step": 5380
3856
+ },
3857
+ {
3858
+ "epoch": 0.13,
3859
+ "grad_norm": 9.164168357849121,
3860
+ "learning_rate": 9.705263157894738e-06,
3861
+ "loss": 0.7995,
3862
+ "step": 5390
3863
+ },
3864
+ {
3865
+ "epoch": 0.14,
3866
+ "grad_norm": 6.297326564788818,
3867
+ "learning_rate": 9.68421052631579e-06,
3868
+ "loss": 0.7484,
3869
+ "step": 5400
3870
+ },
3871
+ {
3872
+ "epoch": 0.14,
3873
+ "grad_norm": 12.500905990600586,
3874
+ "learning_rate": 9.663157894736843e-06,
3875
+ "loss": 0.7291,
3876
+ "step": 5410
3877
+ },
3878
+ {
3879
+ "epoch": 0.14,
3880
+ "grad_norm": 3.1083016395568848,
3881
+ "learning_rate": 9.642105263157896e-06,
3882
+ "loss": 0.8064,
3883
+ "step": 5420
3884
+ },
3885
+ {
3886
+ "epoch": 0.14,
3887
+ "grad_norm": 4.058903694152832,
3888
+ "learning_rate": 9.621052631578947e-06,
3889
+ "loss": 0.7087,
3890
+ "step": 5430
3891
+ },
3892
+ {
3893
+ "epoch": 0.14,
3894
+ "grad_norm": 5.303778648376465,
3895
+ "learning_rate": 9.600000000000001e-06,
3896
+ "loss": 0.6257,
3897
+ "step": 5440
3898
+ },
3899
+ {
3900
+ "epoch": 0.14,
3901
+ "grad_norm": 2.8508620262145996,
3902
+ "learning_rate": 9.578947368421054e-06,
3903
+ "loss": 0.7423,
3904
+ "step": 5450
3905
+ },
3906
+ {
3907
+ "epoch": 0.14,
3908
+ "grad_norm": 5.9560956954956055,
3909
+ "learning_rate": 9.557894736842107e-06,
3910
+ "loss": 0.7304,
3911
+ "step": 5460
3912
+ },
3913
+ {
3914
+ "epoch": 0.14,
3915
+ "grad_norm": 2.8841540813446045,
3916
+ "learning_rate": 9.53684210526316e-06,
3917
+ "loss": 0.7768,
3918
+ "step": 5470
3919
+ },
3920
+ {
3921
+ "epoch": 0.14,
3922
+ "grad_norm": 2.6742358207702637,
3923
+ "learning_rate": 9.515789473684212e-06,
3924
+ "loss": 0.7618,
3925
+ "step": 5480
3926
+ },
3927
+ {
3928
+ "epoch": 0.14,
3929
+ "grad_norm": 4.105114936828613,
3930
+ "learning_rate": 9.494736842105265e-06,
3931
+ "loss": 0.7086,
3932
+ "step": 5490
3933
+ },
3934
+ {
3935
+ "epoch": 0.14,
3936
+ "grad_norm": 4.728137493133545,
3937
+ "learning_rate": 9.473684210526315e-06,
3938
+ "loss": 0.8313,
3939
+ "step": 5500
3940
+ },
3941
+ {
3942
+ "epoch": 0.14,
3943
+ "eval_loss": 0.7711445689201355,
3944
+ "eval_runtime": 67.9047,
3945
+ "eval_samples_per_second": 14.727,
3946
+ "eval_steps_per_second": 14.727,
3947
+ "step": 5500
3948
+ },
3949
+ {
3950
+ "epoch": 0.14,
3951
+ "grad_norm": 4.539173603057861,
3952
+ "learning_rate": 9.452631578947368e-06,
3953
+ "loss": 0.7231,
3954
+ "step": 5510
3955
+ },
3956
+ {
3957
+ "epoch": 0.14,
3958
+ "grad_norm": 4.742118835449219,
3959
+ "learning_rate": 9.43157894736842e-06,
3960
+ "loss": 0.8199,
3961
+ "step": 5520
3962
+ },
3963
+ {
3964
+ "epoch": 0.14,
3965
+ "grad_norm": 5.9068603515625,
3966
+ "learning_rate": 9.410526315789475e-06,
3967
+ "loss": 0.7615,
3968
+ "step": 5530
3969
+ },
3970
+ {
3971
+ "epoch": 0.14,
3972
+ "grad_norm": 7.106772422790527,
3973
+ "learning_rate": 9.389473684210528e-06,
3974
+ "loss": 0.7139,
3975
+ "step": 5540
3976
+ },
3977
+ {
3978
+ "epoch": 0.14,
3979
+ "grad_norm": 2.272012710571289,
3980
+ "learning_rate": 9.36842105263158e-06,
3981
+ "loss": 0.6264,
3982
+ "step": 5550
3983
+ },
3984
+ {
3985
+ "epoch": 0.14,
3986
+ "grad_norm": 14.025699615478516,
3987
+ "learning_rate": 9.347368421052633e-06,
3988
+ "loss": 0.7416,
3989
+ "step": 5560
3990
+ },
3991
+ {
3992
+ "epoch": 0.14,
3993
+ "grad_norm": 12.747345924377441,
3994
+ "learning_rate": 9.326315789473684e-06,
3995
+ "loss": 0.781,
3996
+ "step": 5570
3997
+ },
3998
+ {
3999
+ "epoch": 0.14,
4000
+ "grad_norm": 7.966195106506348,
4001
+ "learning_rate": 9.305263157894737e-06,
4002
+ "loss": 0.7503,
4003
+ "step": 5580
4004
+ },
4005
+ {
4006
+ "epoch": 0.14,
4007
+ "grad_norm": 3.3705811500549316,
4008
+ "learning_rate": 9.28421052631579e-06,
4009
+ "loss": 0.7704,
4010
+ "step": 5590
4011
+ },
4012
+ {
4013
+ "epoch": 0.14,
4014
+ "grad_norm": 5.239542007446289,
4015
+ "learning_rate": 9.263157894736842e-06,
4016
+ "loss": 0.6806,
4017
+ "step": 5600
4018
+ },
4019
+ {
4020
+ "epoch": 0.14,
4021
+ "grad_norm": 6.395047187805176,
4022
+ "learning_rate": 9.242105263157896e-06,
4023
+ "loss": 0.6961,
4024
+ "step": 5610
4025
+ },
4026
+ {
4027
+ "epoch": 0.14,
4028
+ "grad_norm": 3.807992458343506,
4029
+ "learning_rate": 9.221052631578949e-06,
4030
+ "loss": 0.769,
4031
+ "step": 5620
4032
+ },
4033
+ {
4034
+ "epoch": 0.14,
4035
+ "grad_norm": 3.8179049491882324,
4036
+ "learning_rate": 9.200000000000002e-06,
4037
+ "loss": 0.7515,
4038
+ "step": 5630
4039
+ },
4040
+ {
4041
+ "epoch": 0.14,
4042
+ "grad_norm": 4.826687812805176,
4043
+ "learning_rate": 9.178947368421053e-06,
4044
+ "loss": 0.7337,
4045
+ "step": 5640
4046
+ },
4047
+ {
4048
+ "epoch": 0.14,
4049
+ "grad_norm": 4.776168346405029,
4050
+ "learning_rate": 9.157894736842105e-06,
4051
+ "loss": 0.7173,
4052
+ "step": 5650
4053
+ },
4054
+ {
4055
+ "epoch": 0.14,
4056
+ "grad_norm": 4.10529088973999,
4057
+ "learning_rate": 9.136842105263158e-06,
4058
+ "loss": 0.7255,
4059
+ "step": 5660
4060
+ },
4061
+ {
4062
+ "epoch": 0.14,
4063
+ "grad_norm": 5.4715189933776855,
4064
+ "learning_rate": 9.11578947368421e-06,
4065
+ "loss": 0.8092,
4066
+ "step": 5670
4067
+ },
4068
+ {
4069
+ "epoch": 0.14,
4070
+ "grad_norm": 3.8921728134155273,
4071
+ "learning_rate": 9.094736842105263e-06,
4072
+ "loss": 0.6684,
4073
+ "step": 5680
4074
+ },
4075
+ {
4076
+ "epoch": 0.14,
4077
+ "grad_norm": 5.904684066772461,
4078
+ "learning_rate": 9.073684210526316e-06,
4079
+ "loss": 0.7804,
4080
+ "step": 5690
4081
+ },
4082
+ {
4083
+ "epoch": 0.14,
4084
+ "grad_norm": 9.521209716796875,
4085
+ "learning_rate": 9.05263157894737e-06,
4086
+ "loss": 0.793,
4087
+ "step": 5700
4088
+ },
4089
+ {
4090
+ "epoch": 0.14,
4091
+ "grad_norm": 11.125286102294922,
4092
+ "learning_rate": 9.031578947368423e-06,
4093
+ "loss": 0.8254,
4094
+ "step": 5710
4095
+ },
4096
+ {
4097
+ "epoch": 0.14,
4098
+ "grad_norm": 8.136049270629883,
4099
+ "learning_rate": 9.010526315789474e-06,
4100
+ "loss": 0.7475,
4101
+ "step": 5720
4102
+ },
4103
+ {
4104
+ "epoch": 0.14,
4105
+ "grad_norm": 2.4722092151641846,
4106
+ "learning_rate": 8.989473684210527e-06,
4107
+ "loss": 0.7268,
4108
+ "step": 5730
4109
+ },
4110
+ {
4111
+ "epoch": 0.14,
4112
+ "grad_norm": 3.330580711364746,
4113
+ "learning_rate": 8.96842105263158e-06,
4114
+ "loss": 0.7995,
4115
+ "step": 5740
4116
+ },
4117
+ {
4118
+ "epoch": 0.14,
4119
+ "grad_norm": 25.711868286132812,
4120
+ "learning_rate": 8.947368421052632e-06,
4121
+ "loss": 0.801,
4122
+ "step": 5750
4123
+ },
4124
+ {
4125
+ "epoch": 0.14,
4126
+ "grad_norm": 2.3957395553588867,
4127
+ "learning_rate": 8.926315789473685e-06,
4128
+ "loss": 0.6988,
4129
+ "step": 5760
4130
+ },
4131
+ {
4132
+ "epoch": 0.14,
4133
+ "grad_norm": 3.033153533935547,
4134
+ "learning_rate": 8.905263157894737e-06,
4135
+ "loss": 0.7378,
4136
+ "step": 5770
4137
+ },
4138
+ {
4139
+ "epoch": 0.14,
4140
+ "grad_norm": 4.359398365020752,
4141
+ "learning_rate": 8.884210526315792e-06,
4142
+ "loss": 0.7214,
4143
+ "step": 5780
4144
+ },
4145
+ {
4146
+ "epoch": 0.14,
4147
+ "grad_norm": 3.08485746383667,
4148
+ "learning_rate": 8.863157894736842e-06,
4149
+ "loss": 0.7034,
4150
+ "step": 5790
4151
+ },
4152
+ {
4153
+ "epoch": 0.14,
4154
+ "grad_norm": 4.156674385070801,
4155
+ "learning_rate": 8.842105263157895e-06,
4156
+ "loss": 0.7833,
4157
+ "step": 5800
4158
+ },
4159
+ {
4160
+ "epoch": 0.15,
4161
+ "grad_norm": 4.031563758850098,
4162
+ "learning_rate": 8.821052631578948e-06,
4163
+ "loss": 0.7385,
4164
+ "step": 5810
4165
+ },
4166
+ {
4167
+ "epoch": 0.15,
4168
+ "grad_norm": 9.957317352294922,
4169
+ "learning_rate": 8.8e-06,
4170
+ "loss": 0.8572,
4171
+ "step": 5820
4172
+ },
4173
+ {
4174
+ "epoch": 0.15,
4175
+ "grad_norm": 3.951910972595215,
4176
+ "learning_rate": 8.778947368421053e-06,
4177
+ "loss": 0.7374,
4178
+ "step": 5830
4179
+ },
4180
+ {
4181
+ "epoch": 0.15,
4182
+ "grad_norm": 5.296828746795654,
4183
+ "learning_rate": 8.757894736842106e-06,
4184
+ "loss": 0.7619,
4185
+ "step": 5840
4186
+ },
4187
+ {
4188
+ "epoch": 0.15,
4189
+ "grad_norm": 7.079039096832275,
4190
+ "learning_rate": 8.736842105263158e-06,
4191
+ "loss": 0.7842,
4192
+ "step": 5850
4193
+ },
4194
+ {
4195
+ "epoch": 0.15,
4196
+ "grad_norm": 4.972481727600098,
4197
+ "learning_rate": 8.715789473684211e-06,
4198
+ "loss": 0.7039,
4199
+ "step": 5860
4200
+ },
4201
+ {
4202
+ "epoch": 0.15,
4203
+ "grad_norm": 11.936322212219238,
4204
+ "learning_rate": 8.694736842105264e-06,
4205
+ "loss": 0.6701,
4206
+ "step": 5870
4207
+ },
4208
+ {
4209
+ "epoch": 0.15,
4210
+ "grad_norm": 4.164266586303711,
4211
+ "learning_rate": 8.673684210526316e-06,
4212
+ "loss": 0.7481,
4213
+ "step": 5880
4214
+ },
4215
+ {
4216
+ "epoch": 0.15,
4217
+ "grad_norm": 4.0412397384643555,
4218
+ "learning_rate": 8.652631578947369e-06,
4219
+ "loss": 0.8783,
4220
+ "step": 5890
4221
+ },
4222
+ {
4223
+ "epoch": 0.15,
4224
+ "grad_norm": 13.239718437194824,
4225
+ "learning_rate": 8.631578947368422e-06,
4226
+ "loss": 0.8639,
4227
+ "step": 5900
4228
+ },
4229
+ {
4230
+ "epoch": 0.15,
4231
+ "grad_norm": 5.553131103515625,
4232
+ "learning_rate": 8.610526315789474e-06,
4233
+ "loss": 0.7861,
4234
+ "step": 5910
4235
+ },
4236
+ {
4237
+ "epoch": 0.15,
4238
+ "grad_norm": 4.507501602172852,
4239
+ "learning_rate": 8.589473684210527e-06,
4240
+ "loss": 0.7526,
4241
+ "step": 5920
4242
+ },
4243
+ {
4244
+ "epoch": 0.15,
4245
+ "grad_norm": 3.70124888420105,
4246
+ "learning_rate": 8.56842105263158e-06,
4247
+ "loss": 0.8391,
4248
+ "step": 5930
4249
+ },
4250
+ {
4251
+ "epoch": 0.15,
4252
+ "grad_norm": 4.307315349578857,
4253
+ "learning_rate": 8.547368421052632e-06,
4254
+ "loss": 0.7253,
4255
+ "step": 5940
4256
+ },
4257
+ {
4258
+ "epoch": 0.15,
4259
+ "grad_norm": 12.232582092285156,
4260
+ "learning_rate": 8.526315789473685e-06,
4261
+ "loss": 0.8559,
4262
+ "step": 5950
4263
+ },
4264
+ {
4265
+ "epoch": 0.15,
4266
+ "grad_norm": 3.0924105644226074,
4267
+ "learning_rate": 8.505263157894738e-06,
4268
+ "loss": 0.6245,
4269
+ "step": 5960
4270
+ },
4271
+ {
4272
+ "epoch": 0.15,
4273
+ "grad_norm": 2.90191912651062,
4274
+ "learning_rate": 8.48421052631579e-06,
4275
+ "loss": 0.6643,
4276
+ "step": 5970
4277
+ },
4278
+ {
4279
+ "epoch": 0.15,
4280
+ "grad_norm": 3.4637041091918945,
4281
+ "learning_rate": 8.463157894736843e-06,
4282
+ "loss": 0.72,
4283
+ "step": 5980
4284
+ },
4285
+ {
4286
+ "epoch": 0.15,
4287
+ "grad_norm": 2.8273704051971436,
4288
+ "learning_rate": 8.442105263157896e-06,
4289
+ "loss": 0.7202,
4290
+ "step": 5990
4291
+ },
4292
+ {
4293
+ "epoch": 0.15,
4294
+ "grad_norm": 7.119280815124512,
4295
+ "learning_rate": 8.421052631578948e-06,
4296
+ "loss": 0.7047,
4297
+ "step": 6000
4298
+ },
4299
+ {
4300
+ "epoch": 0.15,
4301
+ "eval_loss": 0.7685219645500183,
4302
+ "eval_runtime": 67.892,
4303
+ "eval_samples_per_second": 14.729,
4304
+ "eval_steps_per_second": 14.729,
4305
+ "step": 6000
4306
+ },
4307
+ {
4308
+ "epoch": 0.15,
4309
+ "grad_norm": 4.9551520347595215,
4310
+ "learning_rate": 8.400000000000001e-06,
4311
+ "loss": 0.6911,
4312
+ "step": 6010
4313
+ },
4314
+ {
4315
+ "epoch": 0.15,
4316
+ "grad_norm": 2.9231200218200684,
4317
+ "learning_rate": 8.378947368421054e-06,
4318
+ "loss": 0.7942,
4319
+ "step": 6020
4320
+ },
4321
+ {
4322
+ "epoch": 0.15,
4323
+ "grad_norm": 7.254823684692383,
4324
+ "learning_rate": 8.357894736842106e-06,
4325
+ "loss": 0.7811,
4326
+ "step": 6030
4327
+ },
4328
+ {
4329
+ "epoch": 0.15,
4330
+ "grad_norm": 3.8563404083251953,
4331
+ "learning_rate": 8.336842105263159e-06,
4332
+ "loss": 0.7523,
4333
+ "step": 6040
4334
+ },
4335
+ {
4336
+ "epoch": 0.15,
4337
+ "grad_norm": 3.5061299800872803,
4338
+ "learning_rate": 8.315789473684212e-06,
4339
+ "loss": 0.6222,
4340
+ "step": 6050
4341
+ },
4342
+ {
4343
+ "epoch": 0.15,
4344
+ "grad_norm": 3.3213858604431152,
4345
+ "learning_rate": 8.294736842105264e-06,
4346
+ "loss": 0.7617,
4347
+ "step": 6060
4348
+ },
4349
+ {
4350
+ "epoch": 0.15,
4351
+ "grad_norm": 5.054555416107178,
4352
+ "learning_rate": 8.273684210526317e-06,
4353
+ "loss": 0.7333,
4354
+ "step": 6070
4355
+ },
4356
+ {
4357
+ "epoch": 0.15,
4358
+ "grad_norm": 3.5189318656921387,
4359
+ "learning_rate": 8.25263157894737e-06,
4360
+ "loss": 0.8676,
4361
+ "step": 6080
4362
+ },
4363
+ {
4364
+ "epoch": 0.15,
4365
+ "grad_norm": 4.989790439605713,
4366
+ "learning_rate": 8.231578947368422e-06,
4367
+ "loss": 0.6678,
4368
+ "step": 6090
4369
+ },
4370
+ {
4371
+ "epoch": 0.15,
4372
+ "grad_norm": 7.941010475158691,
4373
+ "learning_rate": 8.210526315789475e-06,
4374
+ "loss": 0.7317,
4375
+ "step": 6100
4376
+ },
4377
+ {
4378
+ "epoch": 0.15,
4379
+ "grad_norm": 6.6499247550964355,
4380
+ "learning_rate": 8.189473684210527e-06,
4381
+ "loss": 0.7484,
4382
+ "step": 6110
4383
+ },
4384
+ {
4385
+ "epoch": 0.15,
4386
+ "grad_norm": 3.512948513031006,
4387
+ "learning_rate": 8.16842105263158e-06,
4388
+ "loss": 0.8508,
4389
+ "step": 6120
4390
+ },
4391
+ {
4392
+ "epoch": 0.15,
4393
+ "grad_norm": 3.844045400619507,
4394
+ "learning_rate": 8.147368421052633e-06,
4395
+ "loss": 0.7468,
4396
+ "step": 6130
4397
+ },
4398
+ {
4399
+ "epoch": 0.15,
4400
+ "grad_norm": 2.620250701904297,
4401
+ "learning_rate": 8.126315789473684e-06,
4402
+ "loss": 0.6449,
4403
+ "step": 6140
4404
+ },
4405
+ {
4406
+ "epoch": 0.15,
4407
+ "grad_norm": 3.5233919620513916,
4408
+ "learning_rate": 8.105263157894736e-06,
4409
+ "loss": 0.7928,
4410
+ "step": 6150
4411
+ },
4412
+ {
4413
+ "epoch": 0.15,
4414
+ "grad_norm": 4.866186618804932,
4415
+ "learning_rate": 8.08421052631579e-06,
4416
+ "loss": 0.787,
4417
+ "step": 6160
4418
+ },
4419
+ {
4420
+ "epoch": 0.15,
4421
+ "grad_norm": 4.392407417297363,
4422
+ "learning_rate": 8.063157894736843e-06,
4423
+ "loss": 0.7746,
4424
+ "step": 6170
4425
+ },
4426
+ {
4427
+ "epoch": 0.15,
4428
+ "grad_norm": 6.6285176277160645,
4429
+ "learning_rate": 8.042105263157896e-06,
4430
+ "loss": 0.7304,
4431
+ "step": 6180
4432
+ },
4433
+ {
4434
+ "epoch": 0.15,
4435
+ "grad_norm": 2.571240186691284,
4436
+ "learning_rate": 8.021052631578949e-06,
4437
+ "loss": 0.7008,
4438
+ "step": 6190
4439
+ },
4440
+ {
4441
+ "epoch": 0.15,
4442
+ "grad_norm": 2.8306283950805664,
4443
+ "learning_rate": 8.000000000000001e-06,
4444
+ "loss": 0.834,
4445
+ "step": 6200
4446
+ },
4447
+ {
4448
+ "epoch": 0.16,
4449
+ "grad_norm": 2.5514955520629883,
4450
+ "learning_rate": 7.978947368421052e-06,
4451
+ "loss": 0.8136,
4452
+ "step": 6210
4453
+ },
4454
+ {
4455
+ "epoch": 0.16,
4456
+ "grad_norm": 8.471675872802734,
4457
+ "learning_rate": 7.957894736842105e-06,
4458
+ "loss": 0.8439,
4459
+ "step": 6220
4460
+ },
4461
+ {
4462
+ "epoch": 0.16,
4463
+ "grad_norm": 8.785553932189941,
4464
+ "learning_rate": 7.936842105263158e-06,
4465
+ "loss": 0.7763,
4466
+ "step": 6230
4467
+ },
4468
+ {
4469
+ "epoch": 0.16,
4470
+ "grad_norm": 5.334304332733154,
4471
+ "learning_rate": 7.915789473684212e-06,
4472
+ "loss": 0.7832,
4473
+ "step": 6240
4474
+ },
4475
+ {
4476
+ "epoch": 0.16,
4477
+ "grad_norm": 14.861701011657715,
4478
+ "learning_rate": 7.894736842105265e-06,
4479
+ "loss": 0.6889,
4480
+ "step": 6250
4481
+ },
4482
+ {
4483
+ "epoch": 0.16,
4484
+ "grad_norm": 2.040034770965576,
4485
+ "learning_rate": 7.873684210526317e-06,
4486
+ "loss": 0.7422,
4487
+ "step": 6260
4488
+ },
4489
+ {
4490
+ "epoch": 0.16,
4491
+ "grad_norm": 9.74354076385498,
4492
+ "learning_rate": 7.85263157894737e-06,
4493
+ "loss": 0.7765,
4494
+ "step": 6270
4495
+ },
4496
+ {
4497
+ "epoch": 0.16,
4498
+ "grad_norm": 3.4280757904052734,
4499
+ "learning_rate": 7.831578947368421e-06,
4500
+ "loss": 0.7465,
4501
+ "step": 6280
4502
+ },
4503
+ {
4504
+ "epoch": 0.16,
4505
+ "grad_norm": 6.530819416046143,
4506
+ "learning_rate": 7.810526315789474e-06,
4507
+ "loss": 0.8216,
4508
+ "step": 6290
4509
+ },
4510
+ {
4511
+ "epoch": 0.16,
4512
+ "grad_norm": 6.786412239074707,
4513
+ "learning_rate": 7.789473684210526e-06,
4514
+ "loss": 0.7694,
4515
+ "step": 6300
4516
+ },
4517
+ {
4518
+ "epoch": 0.16,
4519
+ "grad_norm": 4.896278381347656,
4520
+ "learning_rate": 7.768421052631579e-06,
4521
+ "loss": 0.8282,
4522
+ "step": 6310
4523
+ },
4524
+ {
4525
+ "epoch": 0.16,
4526
+ "grad_norm": 4.5938825607299805,
4527
+ "learning_rate": 7.747368421052631e-06,
4528
+ "loss": 0.6628,
4529
+ "step": 6320
4530
+ },
4531
+ {
4532
+ "epoch": 0.16,
4533
+ "grad_norm": 2.134136915206909,
4534
+ "learning_rate": 7.726315789473686e-06,
4535
+ "loss": 0.8061,
4536
+ "step": 6330
4537
+ },
4538
+ {
4539
+ "epoch": 0.16,
4540
+ "grad_norm": 7.497835159301758,
4541
+ "learning_rate": 7.705263157894738e-06,
4542
+ "loss": 0.8946,
4543
+ "step": 6340
4544
+ },
4545
+ {
4546
+ "epoch": 0.16,
4547
+ "grad_norm": 3.5185306072235107,
4548
+ "learning_rate": 7.68421052631579e-06,
4549
+ "loss": 0.6689,
4550
+ "step": 6350
4551
+ },
4552
+ {
4553
+ "epoch": 0.16,
4554
+ "grad_norm": 2.464015245437622,
4555
+ "learning_rate": 7.663157894736842e-06,
4556
+ "loss": 0.7758,
4557
+ "step": 6360
4558
+ },
4559
+ {
4560
+ "epoch": 0.16,
4561
+ "grad_norm": 2.803342580795288,
4562
+ "learning_rate": 7.642105263157895e-06,
4563
+ "loss": 0.7478,
4564
+ "step": 6370
4565
+ },
4566
+ {
4567
+ "epoch": 0.16,
4568
+ "grad_norm": 6.2652130126953125,
4569
+ "learning_rate": 7.621052631578948e-06,
4570
+ "loss": 0.7293,
4571
+ "step": 6380
4572
+ },
4573
+ {
4574
+ "epoch": 0.16,
4575
+ "grad_norm": 9.655146598815918,
4576
+ "learning_rate": 7.600000000000001e-06,
4577
+ "loss": 0.7454,
4578
+ "step": 6390
4579
+ },
4580
+ {
4581
+ "epoch": 0.16,
4582
+ "grad_norm": 5.041891574859619,
4583
+ "learning_rate": 7.578947368421054e-06,
4584
+ "loss": 0.8579,
4585
+ "step": 6400
4586
+ },
4587
+ {
4588
+ "epoch": 0.16,
4589
+ "grad_norm": 3.133237838745117,
4590
+ "learning_rate": 7.557894736842106e-06,
4591
+ "loss": 0.6662,
4592
+ "step": 6410
4593
+ },
4594
+ {
4595
+ "epoch": 0.16,
4596
+ "grad_norm": 7.207560062408447,
4597
+ "learning_rate": 7.536842105263158e-06,
4598
+ "loss": 0.8135,
4599
+ "step": 6420
4600
+ },
4601
+ {
4602
+ "epoch": 0.16,
4603
+ "grad_norm": 3.374864101409912,
4604
+ "learning_rate": 7.515789473684211e-06,
4605
+ "loss": 0.7514,
4606
+ "step": 6430
4607
+ },
4608
+ {
4609
+ "epoch": 0.16,
4610
+ "grad_norm": 4.067178249359131,
4611
+ "learning_rate": 7.494736842105263e-06,
4612
+ "loss": 0.7446,
4613
+ "step": 6440
4614
+ },
4615
+ {
4616
+ "epoch": 0.16,
4617
+ "grad_norm": 4.283421516418457,
4618
+ "learning_rate": 7.473684210526316e-06,
4619
+ "loss": 0.7955,
4620
+ "step": 6450
4621
+ },
4622
+ {
4623
+ "epoch": 0.16,
4624
+ "grad_norm": 3.092348098754883,
4625
+ "learning_rate": 7.4526315789473695e-06,
4626
+ "loss": 0.5471,
4627
+ "step": 6460
4628
+ },
4629
+ {
4630
+ "epoch": 0.16,
4631
+ "grad_norm": 9.400391578674316,
4632
+ "learning_rate": 7.431578947368422e-06,
4633
+ "loss": 0.7098,
4634
+ "step": 6470
4635
+ },
4636
+ {
4637
+ "epoch": 0.16,
4638
+ "grad_norm": 5.843224048614502,
4639
+ "learning_rate": 7.410526315789475e-06,
4640
+ "loss": 0.7943,
4641
+ "step": 6480
4642
+ },
4643
+ {
4644
+ "epoch": 0.16,
4645
+ "grad_norm": 3.5985705852508545,
4646
+ "learning_rate": 7.3894736842105275e-06,
4647
+ "loss": 0.8059,
4648
+ "step": 6490
4649
+ },
4650
+ {
4651
+ "epoch": 0.16,
4652
+ "grad_norm": 5.502979278564453,
4653
+ "learning_rate": 7.368421052631579e-06,
4654
+ "loss": 0.6236,
4655
+ "step": 6500
4656
+ },
4657
+ {
4658
+ "epoch": 0.16,
4659
+ "eval_loss": 0.7682243585586548,
4660
+ "eval_runtime": 67.9039,
4661
+ "eval_samples_per_second": 14.727,
4662
+ "eval_steps_per_second": 14.727,
4663
+ "step": 6500
4664
+ },
4665
+ {
4666
+ "epoch": 0.16,
4667
+ "grad_norm": 11.025419235229492,
4668
+ "learning_rate": 7.347368421052632e-06,
4669
+ "loss": 0.8343,
4670
+ "step": 6510
4671
+ },
4672
+ {
4673
+ "epoch": 0.16,
4674
+ "grad_norm": 3.4290804862976074,
4675
+ "learning_rate": 7.326315789473685e-06,
4676
+ "loss": 0.7572,
4677
+ "step": 6520
4678
+ },
4679
+ {
4680
+ "epoch": 0.16,
4681
+ "grad_norm": 3.0629210472106934,
4682
+ "learning_rate": 7.305263157894737e-06,
4683
+ "loss": 0.8245,
4684
+ "step": 6530
4685
+ },
4686
+ {
4687
+ "epoch": 0.16,
4688
+ "grad_norm": 5.065977573394775,
4689
+ "learning_rate": 7.28421052631579e-06,
4690
+ "loss": 0.6447,
4691
+ "step": 6540
4692
+ },
4693
+ {
4694
+ "epoch": 0.16,
4695
+ "grad_norm": 3.971541166305542,
4696
+ "learning_rate": 7.263157894736843e-06,
4697
+ "loss": 0.8688,
4698
+ "step": 6550
4699
+ },
4700
+ {
4701
+ "epoch": 0.16,
4702
+ "grad_norm": 3.4434573650360107,
4703
+ "learning_rate": 7.242105263157896e-06,
4704
+ "loss": 0.6749,
4705
+ "step": 6560
4706
+ },
4707
+ {
4708
+ "epoch": 0.16,
4709
+ "grad_norm": 4.323293685913086,
4710
+ "learning_rate": 7.221052631578948e-06,
4711
+ "loss": 0.7982,
4712
+ "step": 6570
4713
+ },
4714
+ {
4715
+ "epoch": 0.16,
4716
+ "grad_norm": 16.821266174316406,
4717
+ "learning_rate": 7.2000000000000005e-06,
4718
+ "loss": 0.7898,
4719
+ "step": 6580
4720
+ },
4721
+ {
4722
+ "epoch": 0.16,
4723
+ "grad_norm": 3.008687734603882,
4724
+ "learning_rate": 7.178947368421053e-06,
4725
+ "loss": 0.7375,
4726
+ "step": 6590
4727
+ },
4728
+ {
4729
+ "epoch": 0.17,
4730
+ "grad_norm": 3.629837989807129,
4731
+ "learning_rate": 7.157894736842106e-06,
4732
+ "loss": 0.7909,
4733
+ "step": 6600
4734
+ },
4735
+ {
4736
+ "epoch": 0.17,
4737
+ "grad_norm": 5.807744026184082,
4738
+ "learning_rate": 7.1368421052631585e-06,
4739
+ "loss": 0.621,
4740
+ "step": 6610
4741
+ },
4742
+ {
4743
+ "epoch": 0.17,
4744
+ "grad_norm": 3.9960129261016846,
4745
+ "learning_rate": 7.115789473684211e-06,
4746
+ "loss": 0.851,
4747
+ "step": 6620
4748
+ },
4749
+ {
4750
+ "epoch": 0.17,
4751
+ "grad_norm": 2.7165372371673584,
4752
+ "learning_rate": 7.094736842105265e-06,
4753
+ "loss": 0.7872,
4754
+ "step": 6630
4755
+ },
4756
+ {
4757
+ "epoch": 0.17,
4758
+ "grad_norm": 5.922586917877197,
4759
+ "learning_rate": 7.073684210526316e-06,
4760
+ "loss": 0.8822,
4761
+ "step": 6640
4762
+ },
4763
+ {
4764
+ "epoch": 0.17,
4765
+ "grad_norm": 9.046282768249512,
4766
+ "learning_rate": 7.052631578947369e-06,
4767
+ "loss": 0.7454,
4768
+ "step": 6650
4769
+ },
4770
+ {
4771
+ "epoch": 0.17,
4772
+ "grad_norm": 4.76317024230957,
4773
+ "learning_rate": 7.031578947368422e-06,
4774
+ "loss": 0.7116,
4775
+ "step": 6660
4776
+ },
4777
+ {
4778
+ "epoch": 0.17,
4779
+ "grad_norm": 4.31531286239624,
4780
+ "learning_rate": 7.010526315789474e-06,
4781
+ "loss": 0.7892,
4782
+ "step": 6670
4783
+ },
4784
+ {
4785
+ "epoch": 0.17,
4786
+ "grad_norm": 3.0895297527313232,
4787
+ "learning_rate": 6.989473684210527e-06,
4788
+ "loss": 0.7095,
4789
+ "step": 6680
4790
+ },
4791
+ {
4792
+ "epoch": 0.17,
4793
+ "grad_norm": 4.174783706665039,
4794
+ "learning_rate": 6.96842105263158e-06,
4795
+ "loss": 0.8007,
4796
+ "step": 6690
4797
+ },
4798
+ {
4799
+ "epoch": 0.17,
4800
+ "grad_norm": 4.1555280685424805,
4801
+ "learning_rate": 6.947368421052632e-06,
4802
+ "loss": 0.8274,
4803
+ "step": 6700
4804
+ },
4805
+ {
4806
+ "epoch": 0.17,
4807
+ "grad_norm": 3.173882246017456,
4808
+ "learning_rate": 6.926315789473684e-06,
4809
+ "loss": 0.6447,
4810
+ "step": 6710
4811
+ },
4812
+ {
4813
+ "epoch": 0.17,
4814
+ "grad_norm": 2.1489410400390625,
4815
+ "learning_rate": 6.905263157894737e-06,
4816
+ "loss": 0.7428,
4817
+ "step": 6720
4818
+ },
4819
+ {
4820
+ "epoch": 0.17,
4821
+ "grad_norm": 2.523904323577881,
4822
+ "learning_rate": 6.8842105263157895e-06,
4823
+ "loss": 0.8159,
4824
+ "step": 6730
4825
+ },
4826
+ {
4827
+ "epoch": 0.17,
4828
+ "grad_norm": 1.7494622468948364,
4829
+ "learning_rate": 6.863157894736843e-06,
4830
+ "loss": 0.863,
4831
+ "step": 6740
4832
+ },
4833
+ {
4834
+ "epoch": 0.17,
4835
+ "grad_norm": 2.552121639251709,
4836
+ "learning_rate": 6.842105263157896e-06,
4837
+ "loss": 0.7448,
4838
+ "step": 6750
4839
+ },
4840
+ {
4841
+ "epoch": 0.17,
4842
+ "grad_norm": 4.1907453536987305,
4843
+ "learning_rate": 6.821052631578948e-06,
4844
+ "loss": 0.6813,
4845
+ "step": 6760
4846
+ },
4847
+ {
4848
+ "epoch": 0.17,
4849
+ "grad_norm": 4.284384727478027,
4850
+ "learning_rate": 6.800000000000001e-06,
4851
+ "loss": 0.699,
4852
+ "step": 6770
4853
+ },
4854
+ {
4855
+ "epoch": 0.17,
4856
+ "grad_norm": 5.010688781738281,
4857
+ "learning_rate": 6.778947368421053e-06,
4858
+ "loss": 0.7803,
4859
+ "step": 6780
4860
+ },
4861
+ {
4862
+ "epoch": 0.17,
4863
+ "grad_norm": 2.5098397731781006,
4864
+ "learning_rate": 6.7578947368421054e-06,
4865
+ "loss": 0.767,
4866
+ "step": 6790
4867
+ },
4868
+ {
4869
+ "epoch": 0.17,
4870
+ "grad_norm": 2.8980441093444824,
4871
+ "learning_rate": 6.736842105263158e-06,
4872
+ "loss": 0.8084,
4873
+ "step": 6800
4874
+ },
4875
+ {
4876
+ "epoch": 0.17,
4877
+ "grad_norm": 3.8058199882507324,
4878
+ "learning_rate": 6.715789473684211e-06,
4879
+ "loss": 0.7214,
4880
+ "step": 6810
4881
+ },
4882
+ {
4883
+ "epoch": 0.17,
4884
+ "grad_norm": 2.3668529987335205,
4885
+ "learning_rate": 6.694736842105264e-06,
4886
+ "loss": 0.6759,
4887
+ "step": 6820
4888
+ },
4889
+ {
4890
+ "epoch": 0.17,
4891
+ "grad_norm": 5.715735912322998,
4892
+ "learning_rate": 6.673684210526317e-06,
4893
+ "loss": 0.7747,
4894
+ "step": 6830
4895
+ },
4896
+ {
4897
+ "epoch": 0.17,
4898
+ "grad_norm": 8.902985572814941,
4899
+ "learning_rate": 6.6526315789473695e-06,
4900
+ "loss": 0.8256,
4901
+ "step": 6840
4902
+ },
4903
+ {
4904
+ "epoch": 0.17,
4905
+ "grad_norm": 5.802920818328857,
4906
+ "learning_rate": 6.631578947368421e-06,
4907
+ "loss": 0.7682,
4908
+ "step": 6850
4909
+ },
4910
+ {
4911
+ "epoch": 0.17,
4912
+ "grad_norm": 9.218498229980469,
4913
+ "learning_rate": 6.610526315789474e-06,
4914
+ "loss": 0.7855,
4915
+ "step": 6860
4916
+ },
4917
+ {
4918
+ "epoch": 0.17,
4919
+ "grad_norm": 4.406294822692871,
4920
+ "learning_rate": 6.589473684210527e-06,
4921
+ "loss": 0.736,
4922
+ "step": 6870
4923
+ },
4924
+ {
4925
+ "epoch": 0.17,
4926
+ "grad_norm": 5.765889644622803,
4927
+ "learning_rate": 6.568421052631579e-06,
4928
+ "loss": 0.7073,
4929
+ "step": 6880
4930
+ },
4931
+ {
4932
+ "epoch": 0.17,
4933
+ "grad_norm": 2.910264015197754,
4934
+ "learning_rate": 6.547368421052632e-06,
4935
+ "loss": 0.7328,
4936
+ "step": 6890
4937
+ },
4938
+ {
4939
+ "epoch": 0.17,
4940
+ "grad_norm": 9.011739730834961,
4941
+ "learning_rate": 6.526315789473685e-06,
4942
+ "loss": 0.6798,
4943
+ "step": 6900
4944
+ },
4945
+ {
4946
+ "epoch": 0.17,
4947
+ "grad_norm": 8.296028137207031,
4948
+ "learning_rate": 6.505263157894738e-06,
4949
+ "loss": 0.7469,
4950
+ "step": 6910
4951
+ },
4952
+ {
4953
+ "epoch": 0.17,
4954
+ "grad_norm": 5.347682952880859,
4955
+ "learning_rate": 6.484210526315789e-06,
4956
+ "loss": 0.7143,
4957
+ "step": 6920
4958
+ },
4959
+ {
4960
+ "epoch": 0.17,
4961
+ "grad_norm": 5.903685092926025,
4962
+ "learning_rate": 6.463157894736843e-06,
4963
+ "loss": 0.7413,
4964
+ "step": 6930
4965
+ },
4966
+ {
4967
+ "epoch": 0.17,
4968
+ "grad_norm": 4.017665386199951,
4969
+ "learning_rate": 6.442105263157895e-06,
4970
+ "loss": 0.7569,
4971
+ "step": 6940
4972
+ },
4973
+ {
4974
+ "epoch": 0.17,
4975
+ "grad_norm": 2.3947088718414307,
4976
+ "learning_rate": 6.421052631578948e-06,
4977
+ "loss": 0.75,
4978
+ "step": 6950
4979
+ },
4980
+ {
4981
+ "epoch": 0.17,
4982
+ "grad_norm": 4.019251823425293,
4983
+ "learning_rate": 6.4000000000000006e-06,
4984
+ "loss": 0.7364,
4985
+ "step": 6960
4986
+ },
4987
+ {
4988
+ "epoch": 0.17,
4989
+ "grad_norm": 2.439628839492798,
4990
+ "learning_rate": 6.378947368421053e-06,
4991
+ "loss": 0.68,
4992
+ "step": 6970
4993
+ },
4994
+ {
4995
+ "epoch": 0.17,
4996
+ "grad_norm": 2.413942575454712,
4997
+ "learning_rate": 6.357894736842106e-06,
4998
+ "loss": 0.79,
4999
+ "step": 6980
5000
+ },
5001
+ {
5002
+ "epoch": 0.17,
5003
+ "grad_norm": 8.72237491607666,
5004
+ "learning_rate": 6.336842105263158e-06,
5005
+ "loss": 0.6678,
5006
+ "step": 6990
5007
+ },
5008
+ {
5009
+ "epoch": 0.17,
5010
+ "grad_norm": 3.9021055698394775,
5011
+ "learning_rate": 6.31578947368421e-06,
5012
+ "loss": 0.7169,
5013
+ "step": 7000
5014
+ },
5015
+ {
5016
+ "epoch": 0.17,
5017
+ "eval_loss": 0.7889605164527893,
5018
+ "eval_runtime": 67.8704,
5019
+ "eval_samples_per_second": 14.734,
5020
+ "eval_steps_per_second": 14.734,
5021
+ "step": 7000
5022
+ },
5023
+ {
5024
+ "epoch": 0.18,
5025
+ "grad_norm": 8.238909721374512,
5026
+ "learning_rate": 6.294736842105264e-06,
5027
+ "loss": 0.658,
5028
+ "step": 7010
5029
+ },
5030
+ {
5031
+ "epoch": 0.18,
5032
+ "grad_norm": 3.403461456298828,
5033
+ "learning_rate": 6.2736842105263165e-06,
5034
+ "loss": 0.8165,
5035
+ "step": 7020
5036
+ },
5037
+ {
5038
+ "epoch": 0.18,
5039
+ "grad_norm": 5.648688316345215,
5040
+ "learning_rate": 6.252631578947369e-06,
5041
+ "loss": 0.7506,
5042
+ "step": 7030
5043
+ },
5044
+ {
5045
+ "epoch": 0.18,
5046
+ "grad_norm": 2.380591630935669,
5047
+ "learning_rate": 6.231578947368422e-06,
5048
+ "loss": 0.8892,
5049
+ "step": 7040
5050
+ },
5051
+ {
5052
+ "epoch": 0.18,
5053
+ "grad_norm": 4.201750755310059,
5054
+ "learning_rate": 6.2105263157894745e-06,
5055
+ "loss": 0.7069,
5056
+ "step": 7050
5057
+ },
5058
+ {
5059
+ "epoch": 0.18,
5060
+ "grad_norm": 2.9994821548461914,
5061
+ "learning_rate": 6.189473684210526e-06,
5062
+ "loss": 0.6896,
5063
+ "step": 7060
5064
+ },
5065
+ {
5066
+ "epoch": 0.18,
5067
+ "grad_norm": 5.100094318389893,
5068
+ "learning_rate": 6.168421052631579e-06,
5069
+ "loss": 0.6241,
5070
+ "step": 7070
5071
+ },
5072
+ {
5073
+ "epoch": 0.18,
5074
+ "grad_norm": 3.88962721824646,
5075
+ "learning_rate": 6.1473684210526316e-06,
5076
+ "loss": 0.741,
5077
+ "step": 7080
5078
+ },
5079
+ {
5080
+ "epoch": 0.18,
5081
+ "grad_norm": 3.669283151626587,
5082
+ "learning_rate": 6.126315789473685e-06,
5083
+ "loss": 0.5153,
5084
+ "step": 7090
5085
+ },
5086
+ {
5087
+ "epoch": 0.18,
5088
+ "grad_norm": 6.010345458984375,
5089
+ "learning_rate": 6.105263157894738e-06,
5090
+ "loss": 0.7394,
5091
+ "step": 7100
5092
+ },
5093
+ {
5094
+ "epoch": 0.18,
5095
+ "grad_norm": 5.333982467651367,
5096
+ "learning_rate": 6.08421052631579e-06,
5097
+ "loss": 0.6423,
5098
+ "step": 7110
5099
+ },
5100
+ {
5101
+ "epoch": 0.18,
5102
+ "grad_norm": 2.0060064792633057,
5103
+ "learning_rate": 6.063157894736843e-06,
5104
+ "loss": 0.7073,
5105
+ "step": 7120
5106
+ },
5107
+ {
5108
+ "epoch": 0.18,
5109
+ "grad_norm": 3.618821144104004,
5110
+ "learning_rate": 6.042105263157895e-06,
5111
+ "loss": 0.7221,
5112
+ "step": 7130
5113
+ },
5114
+ {
5115
+ "epoch": 0.18,
5116
+ "grad_norm": 2.6231422424316406,
5117
+ "learning_rate": 6.0210526315789475e-06,
5118
+ "loss": 0.6748,
5119
+ "step": 7140
5120
+ },
5121
+ {
5122
+ "epoch": 0.18,
5123
+ "grad_norm": 7.207015514373779,
5124
+ "learning_rate": 6e-06,
5125
+ "loss": 0.7403,
5126
+ "step": 7150
5127
+ },
5128
+ {
5129
+ "epoch": 0.18,
5130
+ "grad_norm": 5.1877031326293945,
5131
+ "learning_rate": 5.978947368421053e-06,
5132
+ "loss": 0.6143,
5133
+ "step": 7160
5134
+ },
5135
+ {
5136
+ "epoch": 0.18,
5137
+ "grad_norm": 3.433973550796509,
5138
+ "learning_rate": 5.9578947368421055e-06,
5139
+ "loss": 0.6593,
5140
+ "step": 7170
5141
+ },
5142
+ {
5143
+ "epoch": 0.18,
5144
+ "grad_norm": 4.261890888214111,
5145
+ "learning_rate": 5.936842105263159e-06,
5146
+ "loss": 0.7119,
5147
+ "step": 7180
5148
+ },
5149
+ {
5150
+ "epoch": 0.18,
5151
+ "grad_norm": 2.4731180667877197,
5152
+ "learning_rate": 5.915789473684212e-06,
5153
+ "loss": 0.7764,
5154
+ "step": 7190
5155
+ },
5156
+ {
5157
+ "epoch": 0.18,
5158
+ "grad_norm": 3.540252923965454,
5159
+ "learning_rate": 5.8947368421052634e-06,
5160
+ "loss": 0.788,
5161
+ "step": 7200
5162
+ },
5163
+ {
5164
+ "epoch": 0.18,
5165
+ "grad_norm": 16.481884002685547,
5166
+ "learning_rate": 5.873684210526316e-06,
5167
+ "loss": 0.7411,
5168
+ "step": 7210
5169
+ },
5170
+ {
5171
+ "epoch": 0.18,
5172
+ "grad_norm": 5.3406548500061035,
5173
+ "learning_rate": 5.852631578947369e-06,
5174
+ "loss": 0.7703,
5175
+ "step": 7220
5176
+ },
5177
+ {
5178
+ "epoch": 0.18,
5179
+ "grad_norm": 5.786658763885498,
5180
+ "learning_rate": 5.831578947368421e-06,
5181
+ "loss": 0.7068,
5182
+ "step": 7230
5183
+ },
5184
+ {
5185
+ "epoch": 0.18,
5186
+ "grad_norm": 6.659720420837402,
5187
+ "learning_rate": 5.810526315789474e-06,
5188
+ "loss": 0.7287,
5189
+ "step": 7240
5190
+ },
5191
+ {
5192
+ "epoch": 0.18,
5193
+ "grad_norm": 2.9273788928985596,
5194
+ "learning_rate": 5.789473684210527e-06,
5195
+ "loss": 0.7059,
5196
+ "step": 7250
5197
+ },
5198
+ {
5199
+ "epoch": 0.18,
5200
+ "grad_norm": 5.475671768188477,
5201
+ "learning_rate": 5.76842105263158e-06,
5202
+ "loss": 0.7284,
5203
+ "step": 7260
5204
+ },
5205
+ {
5206
+ "epoch": 0.18,
5207
+ "grad_norm": 5.699868202209473,
5208
+ "learning_rate": 5.747368421052633e-06,
5209
+ "loss": 0.8036,
5210
+ "step": 7270
5211
+ },
5212
+ {
5213
+ "epoch": 0.18,
5214
+ "grad_norm": 3.518573045730591,
5215
+ "learning_rate": 5.726315789473685e-06,
5216
+ "loss": 0.7209,
5217
+ "step": 7280
5218
+ },
5219
+ {
5220
+ "epoch": 0.18,
5221
+ "grad_norm": 8.151999473571777,
5222
+ "learning_rate": 5.705263157894737e-06,
5223
+ "loss": 0.6903,
5224
+ "step": 7290
5225
+ },
5226
+ {
5227
+ "epoch": 0.18,
5228
+ "grad_norm": 4.088874340057373,
5229
+ "learning_rate": 5.68421052631579e-06,
5230
+ "loss": 0.7685,
5231
+ "step": 7300
5232
+ },
5233
+ {
5234
+ "epoch": 0.18,
5235
+ "grad_norm": 9.118200302124023,
5236
+ "learning_rate": 5.663157894736843e-06,
5237
+ "loss": 0.7256,
5238
+ "step": 7310
5239
+ },
5240
+ {
5241
+ "epoch": 0.18,
5242
+ "grad_norm": 6.765544414520264,
5243
+ "learning_rate": 5.642105263157895e-06,
5244
+ "loss": 0.8016,
5245
+ "step": 7320
5246
+ },
5247
+ {
5248
+ "epoch": 0.18,
5249
+ "grad_norm": 11.424837112426758,
5250
+ "learning_rate": 5.621052631578948e-06,
5251
+ "loss": 0.7721,
5252
+ "step": 7330
5253
+ },
5254
+ {
5255
+ "epoch": 0.18,
5256
+ "grad_norm": 5.862210750579834,
5257
+ "learning_rate": 5.600000000000001e-06,
5258
+ "loss": 0.6898,
5259
+ "step": 7340
5260
+ },
5261
+ {
5262
+ "epoch": 0.18,
5263
+ "grad_norm": 4.197153568267822,
5264
+ "learning_rate": 5.578947368421052e-06,
5265
+ "loss": 0.6907,
5266
+ "step": 7350
5267
+ },
5268
+ {
5269
+ "epoch": 0.18,
5270
+ "grad_norm": 6.712553977966309,
5271
+ "learning_rate": 5.557894736842105e-06,
5272
+ "loss": 0.773,
5273
+ "step": 7360
5274
+ },
5275
+ {
5276
+ "epoch": 0.18,
5277
+ "grad_norm": 4.968278408050537,
5278
+ "learning_rate": 5.5368421052631586e-06,
5279
+ "loss": 0.7892,
5280
+ "step": 7370
5281
+ },
5282
+ {
5283
+ "epoch": 0.18,
5284
+ "grad_norm": 3.8882153034210205,
5285
+ "learning_rate": 5.515789473684211e-06,
5286
+ "loss": 0.8365,
5287
+ "step": 7380
5288
+ },
5289
+ {
5290
+ "epoch": 0.18,
5291
+ "grad_norm": 6.6297197341918945,
5292
+ "learning_rate": 5.494736842105264e-06,
5293
+ "loss": 0.7374,
5294
+ "step": 7390
5295
+ },
5296
+ {
5297
+ "epoch": 0.18,
5298
+ "grad_norm": 2.2362327575683594,
5299
+ "learning_rate": 5.4736842105263165e-06,
5300
+ "loss": 0.8293,
5301
+ "step": 7400
5302
+ },
5303
+ {
5304
+ "epoch": 0.19,
5305
+ "grad_norm": 4.1008100509643555,
5306
+ "learning_rate": 5.452631578947369e-06,
5307
+ "loss": 0.7048,
5308
+ "step": 7410
5309
+ },
5310
+ {
5311
+ "epoch": 0.19,
5312
+ "grad_norm": 4.488921642303467,
5313
+ "learning_rate": 5.431578947368421e-06,
5314
+ "loss": 0.7902,
5315
+ "step": 7420
5316
+ },
5317
+ {
5318
+ "epoch": 0.19,
5319
+ "grad_norm": 3.7497622966766357,
5320
+ "learning_rate": 5.410526315789474e-06,
5321
+ "loss": 0.8359,
5322
+ "step": 7430
5323
+ },
5324
+ {
5325
+ "epoch": 0.19,
5326
+ "grad_norm": 3.192277193069458,
5327
+ "learning_rate": 5.389473684210526e-06,
5328
+ "loss": 0.7253,
5329
+ "step": 7440
5330
+ },
5331
+ {
5332
+ "epoch": 0.19,
5333
+ "grad_norm": 4.586243629455566,
5334
+ "learning_rate": 5.36842105263158e-06,
5335
+ "loss": 0.7588,
5336
+ "step": 7450
5337
+ },
5338
+ {
5339
+ "epoch": 0.19,
5340
+ "grad_norm": 3.424870729446411,
5341
+ "learning_rate": 5.3473684210526325e-06,
5342
+ "loss": 0.7268,
5343
+ "step": 7460
5344
+ },
5345
+ {
5346
+ "epoch": 0.19,
5347
+ "grad_norm": 28.807186126708984,
5348
+ "learning_rate": 5.326315789473685e-06,
5349
+ "loss": 0.7979,
5350
+ "step": 7470
5351
+ },
5352
+ {
5353
+ "epoch": 0.19,
5354
+ "grad_norm": 4.297823905944824,
5355
+ "learning_rate": 5.305263157894738e-06,
5356
+ "loss": 0.768,
5357
+ "step": 7480
5358
+ },
5359
+ {
5360
+ "epoch": 0.19,
5361
+ "grad_norm": 4.891976833343506,
5362
+ "learning_rate": 5.2842105263157896e-06,
5363
+ "loss": 0.7063,
5364
+ "step": 7490
5365
+ },
5366
+ {
5367
+ "epoch": 0.19,
5368
+ "grad_norm": 4.083632469177246,
5369
+ "learning_rate": 5.263157894736842e-06,
5370
+ "loss": 0.8102,
5371
+ "step": 7500
5372
+ },
5373
+ {
5374
+ "epoch": 0.19,
5375
+ "eval_loss": 0.7507393956184387,
5376
+ "eval_runtime": 67.8717,
5377
+ "eval_samples_per_second": 14.734,
5378
+ "eval_steps_per_second": 14.734,
5379
+ "step": 7500
5380
  }
5381
  ],
5382
  "logging_steps": 10,
 
5384
  "num_input_tokens_seen": 0,
5385
  "num_train_epochs": 1,
5386
  "save_steps": 2500,
5387
+ "total_flos": 1.2076594495488e+17,
5388
  "train_batch_size": 1,
5389
  "trial_name": null,
5390
  "trial_params": null