{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.24991175432403812,
"eval_steps": 500,
"global_step": 531,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004706436051300153,
"grad_norm": 0.6595008969306946,
"learning_rate": 2.5000000000000004e-07,
"loss": 9.6515,
"step": 1
},
{
"epoch": 0.0009412872102600306,
"grad_norm": 0.7391405701637268,
"learning_rate": 5.000000000000001e-07,
"loss": 9.5434,
"step": 2
},
{
"epoch": 0.0014119308153900459,
"grad_norm": 0.8721428513526917,
"learning_rate": 7.5e-07,
"loss": 9.0645,
"step": 3
},
{
"epoch": 0.0018825744205200612,
"grad_norm": 0.9540417790412903,
"learning_rate": 1.0000000000000002e-06,
"loss": 9.0978,
"step": 4
},
{
"epoch": 0.0023532180256500765,
"grad_norm": 1.0068703889846802,
"learning_rate": 1.25e-06,
"loss": 8.8096,
"step": 5
},
{
"epoch": 0.0028238616307800918,
"grad_norm": 0.7046281695365906,
"learning_rate": 1.5e-06,
"loss": 9.5863,
"step": 6
},
{
"epoch": 0.003294505235910107,
"grad_norm": 1.027761459350586,
"learning_rate": 1.7500000000000002e-06,
"loss": 9.3746,
"step": 7
},
{
"epoch": 0.0037651488410401224,
"grad_norm": 0.7785173058509827,
"learning_rate": 2.0000000000000003e-06,
"loss": 9.1443,
"step": 8
},
{
"epoch": 0.004235792446170138,
"grad_norm": 0.8485608696937561,
"learning_rate": 2.25e-06,
"loss": 9.2293,
"step": 9
},
{
"epoch": 0.004706436051300153,
"grad_norm": 0.8275871872901917,
"learning_rate": 2.5e-06,
"loss": 8.7838,
"step": 10
},
{
"epoch": 0.005177079656430168,
"grad_norm": 0.5895422101020813,
"learning_rate": 2.7500000000000004e-06,
"loss": 9.5706,
"step": 11
},
{
"epoch": 0.0056477232615601836,
"grad_norm": 0.9113247394561768,
"learning_rate": 3e-06,
"loss": 8.9198,
"step": 12
},
{
"epoch": 0.006118366866690199,
"grad_norm": 0.7459664940834045,
"learning_rate": 3.2500000000000002e-06,
"loss": 9.2372,
"step": 13
},
{
"epoch": 0.006589010471820214,
"grad_norm": 0.6556370854377747,
"learning_rate": 3.5000000000000004e-06,
"loss": 9.1809,
"step": 14
},
{
"epoch": 0.0070596540769502295,
"grad_norm": 0.719078540802002,
"learning_rate": 3.75e-06,
"loss": 9.2422,
"step": 15
},
{
"epoch": 0.007530297682080245,
"grad_norm": 0.8138344287872314,
"learning_rate": 4.000000000000001e-06,
"loss": 9.271,
"step": 16
},
{
"epoch": 0.00800094128721026,
"grad_norm": 0.7246189713478088,
"learning_rate": 4.250000000000001e-06,
"loss": 9.5405,
"step": 17
},
{
"epoch": 0.008471584892340275,
"grad_norm": 0.8132815361022949,
"learning_rate": 4.5e-06,
"loss": 9.7983,
"step": 18
},
{
"epoch": 0.00894222849747029,
"grad_norm": 0.5946951508522034,
"learning_rate": 4.75e-06,
"loss": 9.733,
"step": 19
},
{
"epoch": 0.009412872102600306,
"grad_norm": 0.5157704949378967,
"learning_rate": 5e-06,
"loss": 9.6086,
"step": 20
},
{
"epoch": 0.009883515707730321,
"grad_norm": 0.5629891157150269,
"learning_rate": 5.25e-06,
"loss": 9.2102,
"step": 21
},
{
"epoch": 0.010354159312860337,
"grad_norm": 0.48590287566185,
"learning_rate": 5.500000000000001e-06,
"loss": 9.7732,
"step": 22
},
{
"epoch": 0.010824802917990352,
"grad_norm": 0.5960127711296082,
"learning_rate": 5.750000000000001e-06,
"loss": 9.3421,
"step": 23
},
{
"epoch": 0.011295446523120367,
"grad_norm": 0.48235076665878296,
"learning_rate": 6e-06,
"loss": 9.5374,
"step": 24
},
{
"epoch": 0.011766090128250382,
"grad_norm": 0.4856416881084442,
"learning_rate": 6.25e-06,
"loss": 9.2162,
"step": 25
},
{
"epoch": 0.012236733733380398,
"grad_norm": 0.45604783296585083,
"learning_rate": 6.5000000000000004e-06,
"loss": 9.3802,
"step": 26
},
{
"epoch": 0.012707377338510413,
"grad_norm": 0.4940997064113617,
"learning_rate": 6.750000000000001e-06,
"loss": 8.9352,
"step": 27
},
{
"epoch": 0.013178020943640428,
"grad_norm": 0.5067102909088135,
"learning_rate": 7.000000000000001e-06,
"loss": 9.6871,
"step": 28
},
{
"epoch": 0.013648664548770444,
"grad_norm": 0.5070438385009766,
"learning_rate": 7.25e-06,
"loss": 9.1244,
"step": 29
},
{
"epoch": 0.014119308153900459,
"grad_norm": 0.47256559133529663,
"learning_rate": 7.5e-06,
"loss": 9.6139,
"step": 30
},
{
"epoch": 0.014589951759030474,
"grad_norm": 0.6668869853019714,
"learning_rate": 7.75e-06,
"loss": 8.9173,
"step": 31
},
{
"epoch": 0.01506059536416049,
"grad_norm": 0.7926103472709656,
"learning_rate": 8.000000000000001e-06,
"loss": 8.8604,
"step": 32
},
{
"epoch": 0.015531238969290505,
"grad_norm": 0.4389215409755707,
"learning_rate": 8.25e-06,
"loss": 9.42,
"step": 33
},
{
"epoch": 0.01600188257442052,
"grad_norm": 0.527125895023346,
"learning_rate": 8.500000000000002e-06,
"loss": 9.5552,
"step": 34
},
{
"epoch": 0.016472526179550535,
"grad_norm": 0.5376142263412476,
"learning_rate": 8.75e-06,
"loss": 9.1412,
"step": 35
},
{
"epoch": 0.01694316978468055,
"grad_norm": 0.4762144386768341,
"learning_rate": 9e-06,
"loss": 9.2153,
"step": 36
},
{
"epoch": 0.017413813389810566,
"grad_norm": 0.46567338705062866,
"learning_rate": 9.25e-06,
"loss": 9.3836,
"step": 37
},
{
"epoch": 0.01788445699494058,
"grad_norm": 0.4322827458381653,
"learning_rate": 9.5e-06,
"loss": 8.9984,
"step": 38
},
{
"epoch": 0.018355100600070597,
"grad_norm": 0.42570286989212036,
"learning_rate": 9.750000000000002e-06,
"loss": 9.0916,
"step": 39
},
{
"epoch": 0.018825744205200612,
"grad_norm": 0.43363815546035767,
"learning_rate": 1e-05,
"loss": 9.0663,
"step": 40
},
{
"epoch": 0.019296387810330627,
"grad_norm": 0.3969482481479645,
"learning_rate": 1.025e-05,
"loss": 9.4064,
"step": 41
},
{
"epoch": 0.019767031415460642,
"grad_norm": 0.4335750639438629,
"learning_rate": 1.05e-05,
"loss": 9.262,
"step": 42
},
{
"epoch": 0.020237675020590658,
"grad_norm": 0.4210178852081299,
"learning_rate": 1.075e-05,
"loss": 9.4898,
"step": 43
},
{
"epoch": 0.020708318625720673,
"grad_norm": 0.39311668276786804,
"learning_rate": 1.1000000000000001e-05,
"loss": 9.7063,
"step": 44
},
{
"epoch": 0.02117896223085069,
"grad_norm": 0.39521753787994385,
"learning_rate": 1.125e-05,
"loss": 9.3065,
"step": 45
},
{
"epoch": 0.021649605835980704,
"grad_norm": 0.42978909611701965,
"learning_rate": 1.1500000000000002e-05,
"loss": 8.9722,
"step": 46
},
{
"epoch": 0.02212024944111072,
"grad_norm": 0.47351160645484924,
"learning_rate": 1.175e-05,
"loss": 8.9028,
"step": 47
},
{
"epoch": 0.022590893046240734,
"grad_norm": 0.4192260801792145,
"learning_rate": 1.2e-05,
"loss": 8.913,
"step": 48
},
{
"epoch": 0.02306153665137075,
"grad_norm": 0.42306703329086304,
"learning_rate": 1.225e-05,
"loss": 9.3223,
"step": 49
},
{
"epoch": 0.023532180256500765,
"grad_norm": 0.40158239006996155,
"learning_rate": 1.25e-05,
"loss": 9.5922,
"step": 50
},
{
"epoch": 0.02400282386163078,
"grad_norm": 0.5165021419525146,
"learning_rate": 1.2750000000000002e-05,
"loss": 9.24,
"step": 51
},
{
"epoch": 0.024473467466760795,
"grad_norm": 0.3930136263370514,
"learning_rate": 1.3000000000000001e-05,
"loss": 8.7955,
"step": 52
},
{
"epoch": 0.02494411107189081,
"grad_norm": 0.3975488543510437,
"learning_rate": 1.3250000000000002e-05,
"loss": 8.7474,
"step": 53
},
{
"epoch": 0.025414754677020826,
"grad_norm": 0.46201732754707336,
"learning_rate": 1.3500000000000001e-05,
"loss": 9.1239,
"step": 54
},
{
"epoch": 0.02588539828215084,
"grad_norm": 0.42599615454673767,
"learning_rate": 1.3750000000000002e-05,
"loss": 9.2889,
"step": 55
},
{
"epoch": 0.026356041887280857,
"grad_norm": 0.3889259994029999,
"learning_rate": 1.4000000000000001e-05,
"loss": 9.5315,
"step": 56
},
{
"epoch": 0.026826685492410872,
"grad_norm": 0.3762259781360626,
"learning_rate": 1.4249999999999999e-05,
"loss": 9.3968,
"step": 57
},
{
"epoch": 0.027297329097540887,
"grad_norm": 0.4486519396305084,
"learning_rate": 1.45e-05,
"loss": 9.2345,
"step": 58
},
{
"epoch": 0.027767972702670903,
"grad_norm": 0.43613263964653015,
"learning_rate": 1.475e-05,
"loss": 9.293,
"step": 59
},
{
"epoch": 0.028238616307800918,
"grad_norm": 0.40770891308784485,
"learning_rate": 1.5e-05,
"loss": 8.9544,
"step": 60
},
{
"epoch": 0.028709259912930933,
"grad_norm": 0.36603429913520813,
"learning_rate": 1.525e-05,
"loss": 9.5768,
"step": 61
},
{
"epoch": 0.02917990351806095,
"grad_norm": 0.41165047883987427,
"learning_rate": 1.55e-05,
"loss": 9.0203,
"step": 62
},
{
"epoch": 0.029650547123190964,
"grad_norm": 0.4514125883579254,
"learning_rate": 1.575e-05,
"loss": 9.2653,
"step": 63
},
{
"epoch": 0.03012119072832098,
"grad_norm": 0.41333243250846863,
"learning_rate": 1.6000000000000003e-05,
"loss": 8.9577,
"step": 64
},
{
"epoch": 0.030591834333450994,
"grad_norm": 0.42950087785720825,
"learning_rate": 1.6250000000000002e-05,
"loss": 9.341,
"step": 65
},
{
"epoch": 0.03106247793858101,
"grad_norm": 0.4158640205860138,
"learning_rate": 1.65e-05,
"loss": 9.6118,
"step": 66
},
{
"epoch": 0.031533121543711025,
"grad_norm": 0.39954355359077454,
"learning_rate": 1.675e-05,
"loss": 9.0818,
"step": 67
},
{
"epoch": 0.03200376514884104,
"grad_norm": 0.38233450055122375,
"learning_rate": 1.7000000000000003e-05,
"loss": 9.3953,
"step": 68
},
{
"epoch": 0.032474408753971055,
"grad_norm": 0.37950408458709717,
"learning_rate": 1.725e-05,
"loss": 9.3594,
"step": 69
},
{
"epoch": 0.03294505235910107,
"grad_norm": 0.475953608751297,
"learning_rate": 1.75e-05,
"loss": 9.0956,
"step": 70
},
{
"epoch": 0.033415695964231086,
"grad_norm": 0.4252181947231293,
"learning_rate": 1.775e-05,
"loss": 9.1928,
"step": 71
},
{
"epoch": 0.0338863395693611,
"grad_norm": 0.3946019411087036,
"learning_rate": 1.8e-05,
"loss": 9.1933,
"step": 72
},
{
"epoch": 0.03435698317449112,
"grad_norm": 0.4342809021472931,
"learning_rate": 1.825e-05,
"loss": 9.2859,
"step": 73
},
{
"epoch": 0.03482762677962113,
"grad_norm": 0.3921419084072113,
"learning_rate": 1.85e-05,
"loss": 9.1214,
"step": 74
},
{
"epoch": 0.03529827038475115,
"grad_norm": 0.3992595374584198,
"learning_rate": 1.8750000000000002e-05,
"loss": 9.332,
"step": 75
},
{
"epoch": 0.03576891398988116,
"grad_norm": 0.40269696712493896,
"learning_rate": 1.9e-05,
"loss": 9.4244,
"step": 76
},
{
"epoch": 0.03623955759501118,
"grad_norm": 0.41852205991744995,
"learning_rate": 1.925e-05,
"loss": 9.3765,
"step": 77
},
{
"epoch": 0.03671020120014119,
"grad_norm": 0.5162649750709534,
"learning_rate": 1.9500000000000003e-05,
"loss": 8.3471,
"step": 78
},
{
"epoch": 0.03718084480527121,
"grad_norm": 0.4802299737930298,
"learning_rate": 1.9750000000000002e-05,
"loss": 9.3251,
"step": 79
},
{
"epoch": 0.037651488410401224,
"grad_norm": 0.4261873960494995,
"learning_rate": 2e-05,
"loss": 9.5181,
"step": 80
},
{
"epoch": 0.03812213201553124,
"grad_norm": 0.4193435311317444,
"learning_rate": 2.025e-05,
"loss": 9.4217,
"step": 81
},
{
"epoch": 0.038592775620661254,
"grad_norm": 0.4148464798927307,
"learning_rate": 2.05e-05,
"loss": 8.7618,
"step": 82
},
{
"epoch": 0.03906341922579127,
"grad_norm": 0.4396406412124634,
"learning_rate": 2.075e-05,
"loss": 9.4059,
"step": 83
},
{
"epoch": 0.039534062830921285,
"grad_norm": 0.43215858936309814,
"learning_rate": 2.1e-05,
"loss": 9.0061,
"step": 84
},
{
"epoch": 0.0400047064360513,
"grad_norm": 0.4347785711288452,
"learning_rate": 2.125e-05,
"loss": 8.5384,
"step": 85
},
{
"epoch": 0.040475350041181316,
"grad_norm": 0.47068068385124207,
"learning_rate": 2.15e-05,
"loss": 9.2299,
"step": 86
},
{
"epoch": 0.04094599364631133,
"grad_norm": 0.44863706827163696,
"learning_rate": 2.175e-05,
"loss": 8.7932,
"step": 87
},
{
"epoch": 0.041416637251441346,
"grad_norm": 0.4525277316570282,
"learning_rate": 2.2000000000000003e-05,
"loss": 9.1699,
"step": 88
},
{
"epoch": 0.04188728085657136,
"grad_norm": 0.41207849979400635,
"learning_rate": 2.2250000000000002e-05,
"loss": 9.4979,
"step": 89
},
{
"epoch": 0.04235792446170138,
"grad_norm": 0.4179534912109375,
"learning_rate": 2.25e-05,
"loss": 9.1519,
"step": 90
},
{
"epoch": 0.04282856806683139,
"grad_norm": 0.472789466381073,
"learning_rate": 2.275e-05,
"loss": 9.1048,
"step": 91
},
{
"epoch": 0.04329921167196141,
"grad_norm": 0.44435739517211914,
"learning_rate": 2.3000000000000003e-05,
"loss": 9.2816,
"step": 92
},
{
"epoch": 0.04376985527709142,
"grad_norm": 0.41012299060821533,
"learning_rate": 2.3250000000000003e-05,
"loss": 9.4546,
"step": 93
},
{
"epoch": 0.04424049888222144,
"grad_norm": 0.4100490212440491,
"learning_rate": 2.35e-05,
"loss": 9.4397,
"step": 94
},
{
"epoch": 0.04471114248735145,
"grad_norm": 0.4229314923286438,
"learning_rate": 2.375e-05,
"loss": 8.9033,
"step": 95
},
{
"epoch": 0.04518178609248147,
"grad_norm": 0.39841172099113464,
"learning_rate": 2.4e-05,
"loss": 9.3391,
"step": 96
},
{
"epoch": 0.045652429697611484,
"grad_norm": 0.4041540324687958,
"learning_rate": 2.425e-05,
"loss": 9.3347,
"step": 97
},
{
"epoch": 0.0461230733027415,
"grad_norm": 0.4046013653278351,
"learning_rate": 2.45e-05,
"loss": 9.4645,
"step": 98
},
{
"epoch": 0.046593716907871514,
"grad_norm": 0.3989504277706146,
"learning_rate": 2.4750000000000002e-05,
"loss": 9.2343,
"step": 99
},
{
"epoch": 0.04706436051300153,
"grad_norm": 0.41768062114715576,
"learning_rate": 2.5e-05,
"loss": 9.6114,
"step": 100
},
{
"epoch": 0.047535004118131545,
"grad_norm": 0.4360901713371277,
"learning_rate": 2.525e-05,
"loss": 9.3584,
"step": 101
},
{
"epoch": 0.04800564772326156,
"grad_norm": 0.5093626976013184,
"learning_rate": 2.5500000000000003e-05,
"loss": 9.3969,
"step": 102
},
{
"epoch": 0.048476291328391576,
"grad_norm": 0.5148160457611084,
"learning_rate": 2.5750000000000002e-05,
"loss": 9.3607,
"step": 103
},
{
"epoch": 0.04894693493352159,
"grad_norm": 0.4556065797805786,
"learning_rate": 2.6000000000000002e-05,
"loss": 8.6494,
"step": 104
},
{
"epoch": 0.049417578538651606,
"grad_norm": 0.48136287927627563,
"learning_rate": 2.625e-05,
"loss": 8.8816,
"step": 105
},
{
"epoch": 0.04988822214378162,
"grad_norm": 0.4007977247238159,
"learning_rate": 2.6500000000000004e-05,
"loss": 9.0173,
"step": 106
},
{
"epoch": 0.05035886574891164,
"grad_norm": 0.5088827610015869,
"learning_rate": 2.6750000000000003e-05,
"loss": 9.4898,
"step": 107
},
{
"epoch": 0.05082950935404165,
"grad_norm": 0.4222247898578644,
"learning_rate": 2.7000000000000002e-05,
"loss": 9.5039,
"step": 108
},
{
"epoch": 0.05130015295917167,
"grad_norm": 0.42676958441734314,
"learning_rate": 2.725e-05,
"loss": 9.3007,
"step": 109
},
{
"epoch": 0.05177079656430168,
"grad_norm": 0.4315201938152313,
"learning_rate": 2.7500000000000004e-05,
"loss": 9.1473,
"step": 110
},
{
"epoch": 0.0522414401694317,
"grad_norm": 0.5586130619049072,
"learning_rate": 2.7750000000000004e-05,
"loss": 9.486,
"step": 111
},
{
"epoch": 0.05271208377456171,
"grad_norm": 0.4153185486793518,
"learning_rate": 2.8000000000000003e-05,
"loss": 9.2632,
"step": 112
},
{
"epoch": 0.05318272737969173,
"grad_norm": 0.47736650705337524,
"learning_rate": 2.825e-05,
"loss": 8.9582,
"step": 113
},
{
"epoch": 0.053653370984821744,
"grad_norm": 0.4127710163593292,
"learning_rate": 2.8499999999999998e-05,
"loss": 9.3019,
"step": 114
},
{
"epoch": 0.05412401458995176,
"grad_norm": 0.44509121775627136,
"learning_rate": 2.8749999999999997e-05,
"loss": 9.1081,
"step": 115
},
{
"epoch": 0.054594658195081774,
"grad_norm": 0.4519471526145935,
"learning_rate": 2.9e-05,
"loss": 9.4795,
"step": 116
},
{
"epoch": 0.05506530180021179,
"grad_norm": 0.4292161464691162,
"learning_rate": 2.925e-05,
"loss": 9.2027,
"step": 117
},
{
"epoch": 0.055535945405341805,
"grad_norm": 0.46465009450912476,
"learning_rate": 2.95e-05,
"loss": 9.081,
"step": 118
},
{
"epoch": 0.05600658901047182,
"grad_norm": 0.4395250976085663,
"learning_rate": 2.975e-05,
"loss": 9.4345,
"step": 119
},
{
"epoch": 0.056477232615601836,
"grad_norm": 0.4673008918762207,
"learning_rate": 3e-05,
"loss": 9.3435,
"step": 120
},
{
"epoch": 0.05694787622073185,
"grad_norm": 0.4328051209449768,
"learning_rate": 3.025e-05,
"loss": 8.7147,
"step": 121
},
{
"epoch": 0.057418519825861866,
"grad_norm": 0.444002240896225,
"learning_rate": 3.05e-05,
"loss": 8.8049,
"step": 122
},
{
"epoch": 0.05788916343099188,
"grad_norm": 0.4078370928764343,
"learning_rate": 3.075e-05,
"loss": 9.1032,
"step": 123
},
{
"epoch": 0.0583598070361219,
"grad_norm": 0.4445233941078186,
"learning_rate": 3.1e-05,
"loss": 9.279,
"step": 124
},
{
"epoch": 0.05883045064125191,
"grad_norm": 0.4282757639884949,
"learning_rate": 3.125e-05,
"loss": 9.4163,
"step": 125
},
{
"epoch": 0.05930109424638193,
"grad_norm": 0.41878628730773926,
"learning_rate": 3.15e-05,
"loss": 8.9876,
"step": 126
},
{
"epoch": 0.05977173785151194,
"grad_norm": 0.6357080340385437,
"learning_rate": 3.175e-05,
"loss": 8.4245,
"step": 127
},
{
"epoch": 0.06024238145664196,
"grad_norm": 0.4595104455947876,
"learning_rate": 3.2000000000000005e-05,
"loss": 9.1227,
"step": 128
},
{
"epoch": 0.06071302506177197,
"grad_norm": 1.0947221517562866,
"learning_rate": 3.2250000000000005e-05,
"loss": 8.6819,
"step": 129
},
{
"epoch": 0.06118366866690199,
"grad_norm": 0.43211594223976135,
"learning_rate": 3.2500000000000004e-05,
"loss": 9.1862,
"step": 130
},
{
"epoch": 0.061654312272032004,
"grad_norm": 0.4080043137073517,
"learning_rate": 3.275e-05,
"loss": 9.0489,
"step": 131
},
{
"epoch": 0.06212495587716202,
"grad_norm": 0.48265427350997925,
"learning_rate": 3.3e-05,
"loss": 9.257,
"step": 132
},
{
"epoch": 0.06259559948229203,
"grad_norm": 0.45756152272224426,
"learning_rate": 3.325e-05,
"loss": 8.9598,
"step": 133
},
{
"epoch": 0.06306624308742205,
"grad_norm": 0.3848661780357361,
"learning_rate": 3.35e-05,
"loss": 9.5542,
"step": 134
},
{
"epoch": 0.06353688669255206,
"grad_norm": 0.43142908811569214,
"learning_rate": 3.375000000000001e-05,
"loss": 9.0434,
"step": 135
},
{
"epoch": 0.06400753029768208,
"grad_norm": 0.39845573902130127,
"learning_rate": 3.4000000000000007e-05,
"loss": 9.7228,
"step": 136
},
{
"epoch": 0.06447817390281209,
"grad_norm": 0.4854653775691986,
"learning_rate": 3.4250000000000006e-05,
"loss": 8.9226,
"step": 137
},
{
"epoch": 0.06494881750794211,
"grad_norm": 0.41691291332244873,
"learning_rate": 3.45e-05,
"loss": 9.4588,
"step": 138
},
{
"epoch": 0.06541946111307212,
"grad_norm": 0.41709139943122864,
"learning_rate": 3.475e-05,
"loss": 8.9146,
"step": 139
},
{
"epoch": 0.06589010471820214,
"grad_norm": 0.3843998312950134,
"learning_rate": 3.5e-05,
"loss": 8.9889,
"step": 140
},
{
"epoch": 0.06636074832333215,
"grad_norm": 0.4418933391571045,
"learning_rate": 3.525e-05,
"loss": 9.3688,
"step": 141
},
{
"epoch": 0.06683139192846217,
"grad_norm": 0.3844826817512512,
"learning_rate": 3.55e-05,
"loss": 9.2518,
"step": 142
},
{
"epoch": 0.06730203553359218,
"grad_norm": 0.4951348900794983,
"learning_rate": 3.575e-05,
"loss": 8.9785,
"step": 143
},
{
"epoch": 0.0677726791387222,
"grad_norm": 0.475685179233551,
"learning_rate": 3.6e-05,
"loss": 9.0013,
"step": 144
},
{
"epoch": 0.06824332274385221,
"grad_norm": 0.5578158497810364,
"learning_rate": 3.625e-05,
"loss": 8.9177,
"step": 145
},
{
"epoch": 0.06871396634898223,
"grad_norm": 0.6955916881561279,
"learning_rate": 3.65e-05,
"loss": 8.9298,
"step": 146
},
{
"epoch": 0.06918460995411224,
"grad_norm": 0.4071875810623169,
"learning_rate": 3.675e-05,
"loss": 9.1422,
"step": 147
},
{
"epoch": 0.06965525355924226,
"grad_norm": 0.49543336033821106,
"learning_rate": 3.7e-05,
"loss": 9.4138,
"step": 148
},
{
"epoch": 0.07012589716437227,
"grad_norm": 0.4391457438468933,
"learning_rate": 3.7250000000000004e-05,
"loss": 9.3566,
"step": 149
},
{
"epoch": 0.0705965407695023,
"grad_norm": 0.4311358630657196,
"learning_rate": 3.7500000000000003e-05,
"loss": 8.6678,
"step": 150
},
{
"epoch": 0.0710671843746323,
"grad_norm": 0.4233754873275757,
"learning_rate": 3.775e-05,
"loss": 8.9541,
"step": 151
},
{
"epoch": 0.07153782797976233,
"grad_norm": 0.4653347432613373,
"learning_rate": 3.8e-05,
"loss": 8.953,
"step": 152
},
{
"epoch": 0.07200847158489233,
"grad_norm": 0.4828343689441681,
"learning_rate": 3.825e-05,
"loss": 8.9577,
"step": 153
},
{
"epoch": 0.07247911519002236,
"grad_norm": 0.43757960200309753,
"learning_rate": 3.85e-05,
"loss": 9.2349,
"step": 154
},
{
"epoch": 0.07294975879515236,
"grad_norm": 0.4094442129135132,
"learning_rate": 3.875e-05,
"loss": 9.424,
"step": 155
},
{
"epoch": 0.07342040240028239,
"grad_norm": 0.536808967590332,
"learning_rate": 3.9000000000000006e-05,
"loss": 8.9437,
"step": 156
},
{
"epoch": 0.0738910460054124,
"grad_norm": 0.4084169268608093,
"learning_rate": 3.9250000000000005e-05,
"loss": 9.5204,
"step": 157
},
{
"epoch": 0.07436168961054242,
"grad_norm": 0.4906410574913025,
"learning_rate": 3.9500000000000005e-05,
"loss": 9.0682,
"step": 158
},
{
"epoch": 0.07483233321567243,
"grad_norm": 0.42850637435913086,
"learning_rate": 3.9750000000000004e-05,
"loss": 9.0241,
"step": 159
},
{
"epoch": 0.07530297682080245,
"grad_norm": 0.3832900822162628,
"learning_rate": 4e-05,
"loss": 9.4956,
"step": 160
},
{
"epoch": 0.07577362042593246,
"grad_norm": 0.39132505655288696,
"learning_rate": 4.025e-05,
"loss": 9.4623,
"step": 161
},
{
"epoch": 0.07624426403106248,
"grad_norm": 0.44959893822669983,
"learning_rate": 4.05e-05,
"loss": 9.0518,
"step": 162
},
{
"epoch": 0.07671490763619249,
"grad_norm": 0.41552799940109253,
"learning_rate": 4.075e-05,
"loss": 9.1268,
"step": 163
},
{
"epoch": 0.07718555124132251,
"grad_norm": 0.42259296774864197,
"learning_rate": 4.1e-05,
"loss": 9.1533,
"step": 164
},
{
"epoch": 0.07765619484645252,
"grad_norm": 0.4441682994365692,
"learning_rate": 4.125e-05,
"loss": 8.7568,
"step": 165
},
{
"epoch": 0.07812683845158254,
"grad_norm": 0.42241615056991577,
"learning_rate": 4.15e-05,
"loss": 9.3366,
"step": 166
},
{
"epoch": 0.07859748205671255,
"grad_norm": 0.3997664153575897,
"learning_rate": 4.175e-05,
"loss": 8.855,
"step": 167
},
{
"epoch": 0.07906812566184257,
"grad_norm": 0.4293980300426483,
"learning_rate": 4.2e-05,
"loss": 8.9744,
"step": 168
},
{
"epoch": 0.07953876926697258,
"grad_norm": 0.4279899001121521,
"learning_rate": 4.2250000000000004e-05,
"loss": 9.0692,
"step": 169
},
{
"epoch": 0.0800094128721026,
"grad_norm": 0.4207955002784729,
"learning_rate": 4.25e-05,
"loss": 8.8506,
"step": 170
},
{
"epoch": 0.08048005647723261,
"grad_norm": 0.41057008504867554,
"learning_rate": 4.275e-05,
"loss": 9.2402,
"step": 171
},
{
"epoch": 0.08095070008236263,
"grad_norm": 0.4556719660758972,
"learning_rate": 4.3e-05,
"loss": 9.3806,
"step": 172
},
{
"epoch": 0.08142134368749264,
"grad_norm": 0.4468841850757599,
"learning_rate": 4.325e-05,
"loss": 9.0331,
"step": 173
},
{
"epoch": 0.08189198729262266,
"grad_norm": 0.4206986725330353,
"learning_rate": 4.35e-05,
"loss": 8.6767,
"step": 174
},
{
"epoch": 0.08236263089775267,
"grad_norm": 0.42576491832733154,
"learning_rate": 4.375e-05,
"loss": 8.7183,
"step": 175
},
{
"epoch": 0.08283327450288269,
"grad_norm": 0.4180700182914734,
"learning_rate": 4.4000000000000006e-05,
"loss": 8.8461,
"step": 176
},
{
"epoch": 0.0833039181080127,
"grad_norm": 0.3981553614139557,
"learning_rate": 4.4250000000000005e-05,
"loss": 8.9324,
"step": 177
},
{
"epoch": 0.08377456171314272,
"grad_norm": 0.4038431942462921,
"learning_rate": 4.4500000000000004e-05,
"loss": 8.7611,
"step": 178
},
{
"epoch": 0.08424520531827273,
"grad_norm": 0.4555639326572418,
"learning_rate": 4.4750000000000004e-05,
"loss": 8.4839,
"step": 179
},
{
"epoch": 0.08471584892340275,
"grad_norm": 0.39343494176864624,
"learning_rate": 4.5e-05,
"loss": 9.0263,
"step": 180
},
{
"epoch": 0.08518649252853276,
"grad_norm": 0.4226400852203369,
"learning_rate": 4.525e-05,
"loss": 8.9829,
"step": 181
},
{
"epoch": 0.08565713613366278,
"grad_norm": 0.3735749125480652,
"learning_rate": 4.55e-05,
"loss": 9.6609,
"step": 182
},
{
"epoch": 0.08612777973879279,
"grad_norm": 0.4413192868232727,
"learning_rate": 4.575e-05,
"loss": 9.0126,
"step": 183
},
{
"epoch": 0.08659842334392281,
"grad_norm": 0.3925839364528656,
"learning_rate": 4.600000000000001e-05,
"loss": 9.2048,
"step": 184
},
{
"epoch": 0.08706906694905282,
"grad_norm": 0.3941839933395386,
"learning_rate": 4.6250000000000006e-05,
"loss": 9.2662,
"step": 185
},
{
"epoch": 0.08753971055418285,
"grad_norm": 0.47577032446861267,
"learning_rate": 4.6500000000000005e-05,
"loss": 8.9474,
"step": 186
},
{
"epoch": 0.08801035415931285,
"grad_norm": 0.4306804835796356,
"learning_rate": 4.6750000000000005e-05,
"loss": 8.8199,
"step": 187
},
{
"epoch": 0.08848099776444288,
"grad_norm": 0.4680851995944977,
"learning_rate": 4.7e-05,
"loss": 8.7651,
"step": 188
},
{
"epoch": 0.08895164136957288,
"grad_norm": 0.4325461983680725,
"learning_rate": 4.7249999999999997e-05,
"loss": 9.1391,
"step": 189
},
{
"epoch": 0.0894222849747029,
"grad_norm": 0.7051356434822083,
"learning_rate": 4.75e-05,
"loss": 8.8018,
"step": 190
},
{
"epoch": 0.08989292857983291,
"grad_norm": 0.37214136123657227,
"learning_rate": 4.775e-05,
"loss": 9.4374,
"step": 191
},
{
"epoch": 0.09036357218496294,
"grad_norm": 0.4161190688610077,
"learning_rate": 4.8e-05,
"loss": 9.0213,
"step": 192
},
{
"epoch": 0.09083421579009295,
"grad_norm": 0.39017942547798157,
"learning_rate": 4.825e-05,
"loss": 9.4081,
"step": 193
},
{
"epoch": 0.09130485939522297,
"grad_norm": 0.3661479353904724,
"learning_rate": 4.85e-05,
"loss": 9.5162,
"step": 194
},
{
"epoch": 0.09177550300035298,
"grad_norm": 0.4220457077026367,
"learning_rate": 4.875e-05,
"loss": 8.8268,
"step": 195
},
{
"epoch": 0.092246146605483,
"grad_norm": 0.4123201370239258,
"learning_rate": 4.9e-05,
"loss": 9.1464,
"step": 196
},
{
"epoch": 0.092716790210613,
"grad_norm": 0.3835439383983612,
"learning_rate": 4.9250000000000004e-05,
"loss": 9.2391,
"step": 197
},
{
"epoch": 0.09318743381574303,
"grad_norm": 0.3718632459640503,
"learning_rate": 4.9500000000000004e-05,
"loss": 9.2759,
"step": 198
},
{
"epoch": 0.09365807742087304,
"grad_norm": 0.5267420411109924,
"learning_rate": 4.975e-05,
"loss": 9.0097,
"step": 199
},
{
"epoch": 0.09412872102600306,
"grad_norm": 0.3542408049106598,
"learning_rate": 5e-05,
"loss": 9.5282,
"step": 200
},
{
"epoch": 0.09459936463113307,
"grad_norm": 0.40344443917274475,
"learning_rate": 4.999999247114854e-05,
"loss": 9.3784,
"step": 201
},
{
"epoch": 0.09507000823626309,
"grad_norm": 0.41083309054374695,
"learning_rate": 4.999996988459869e-05,
"loss": 9.4365,
"step": 202
},
{
"epoch": 0.0955406518413931,
"grad_norm": 0.369400292634964,
"learning_rate": 4.9999932240364054e-05,
"loss": 9.3167,
"step": 203
},
{
"epoch": 0.09601129544652312,
"grad_norm": 0.36150887608528137,
"learning_rate": 4.9999879538467306e-05,
"loss": 9.5957,
"step": 204
},
{
"epoch": 0.09648193905165313,
"grad_norm": 0.44035205245018005,
"learning_rate": 4.99998117789402e-05,
"loss": 8.8501,
"step": 205
},
{
"epoch": 0.09695258265678315,
"grad_norm": 0.42898210883140564,
"learning_rate": 4.999972896182352e-05,
"loss": 8.8283,
"step": 206
},
{
"epoch": 0.09742322626191316,
"grad_norm": 0.3809720277786255,
"learning_rate": 4.999963108716718e-05,
"loss": 9.3219,
"step": 207
},
{
"epoch": 0.09789386986704318,
"grad_norm": 0.38228464126586914,
"learning_rate": 4.999951815503011e-05,
"loss": 9.2669,
"step": 208
},
{
"epoch": 0.09836451347217319,
"grad_norm": 0.3908674120903015,
"learning_rate": 4.9999390165480335e-05,
"loss": 8.9417,
"step": 209
},
{
"epoch": 0.09883515707730321,
"grad_norm": 0.34623146057128906,
"learning_rate": 4.999924711859495e-05,
"loss": 9.6014,
"step": 210
},
{
"epoch": 0.09930580068243322,
"grad_norm": 0.3909365236759186,
"learning_rate": 4.99990890144601e-05,
"loss": 9.1546,
"step": 211
},
{
"epoch": 0.09977644428756324,
"grad_norm": 0.3888709843158722,
"learning_rate": 4.999891585317103e-05,
"loss": 9.3649,
"step": 212
},
{
"epoch": 0.10024708789269325,
"grad_norm": 0.45398378372192383,
"learning_rate": 4.9998727634832024e-05,
"loss": 8.9172,
"step": 213
},
{
"epoch": 0.10071773149782327,
"grad_norm": 0.36648306250572205,
"learning_rate": 4.9998524359556445e-05,
"loss": 9.0638,
"step": 214
},
{
"epoch": 0.10118837510295328,
"grad_norm": 0.37433892488479614,
"learning_rate": 4.999830602746673e-05,
"loss": 9.3322,
"step": 215
},
{
"epoch": 0.1016590187080833,
"grad_norm": 0.38904431462287903,
"learning_rate": 4.99980726386944e-05,
"loss": 9.322,
"step": 216
},
{
"epoch": 0.10212966231321331,
"grad_norm": 0.38138681650161743,
"learning_rate": 4.9997824193380004e-05,
"loss": 9.6177,
"step": 217
},
{
"epoch": 0.10260030591834333,
"grad_norm": 0.39529645442962646,
"learning_rate": 4.9997560691673194e-05,
"loss": 9.054,
"step": 218
},
{
"epoch": 0.10307094952347334,
"grad_norm": 0.4126908481121063,
"learning_rate": 4.999728213373267e-05,
"loss": 9.4406,
"step": 219
},
{
"epoch": 0.10354159312860337,
"grad_norm": 0.4137309491634369,
"learning_rate": 4.999698851972622e-05,
"loss": 9.0403,
"step": 220
},
{
"epoch": 0.10401223673373337,
"grad_norm": 0.4086442291736603,
"learning_rate": 4.999667984983069e-05,
"loss": 9.3006,
"step": 221
},
{
"epoch": 0.1044828803388634,
"grad_norm": 0.5080444812774658,
"learning_rate": 4.999635612423198e-05,
"loss": 9.1856,
"step": 222
},
{
"epoch": 0.1049535239439934,
"grad_norm": 0.36199596524238586,
"learning_rate": 4.9996017343125085e-05,
"loss": 9.3119,
"step": 223
},
{
"epoch": 0.10542416754912343,
"grad_norm": 0.4086923897266388,
"learning_rate": 4.9995663506714054e-05,
"loss": 9.1335,
"step": 224
},
{
"epoch": 0.10589481115425343,
"grad_norm": 0.42041823267936707,
"learning_rate": 4.9995294615212006e-05,
"loss": 8.9113,
"step": 225
},
{
"epoch": 0.10636545475938346,
"grad_norm": 0.35369089245796204,
"learning_rate": 4.999491066884113e-05,
"loss": 9.4732,
"step": 226
},
{
"epoch": 0.10683609836451347,
"grad_norm": 0.8479387164115906,
"learning_rate": 4.9994511667832665e-05,
"loss": 9.1135,
"step": 227
},
{
"epoch": 0.10730674196964349,
"grad_norm": 0.38847988843917847,
"learning_rate": 4.999409761242696e-05,
"loss": 9.3632,
"step": 228
},
{
"epoch": 0.1077773855747735,
"grad_norm": 0.43660977482795715,
"learning_rate": 4.999366850287337e-05,
"loss": 8.6279,
"step": 229
},
{
"epoch": 0.10824802917990352,
"grad_norm": 0.6459296345710754,
"learning_rate": 4.999322433943038e-05,
"loss": 9.1736,
"step": 230
},
{
"epoch": 0.10871867278503353,
"grad_norm": 0.453952819108963,
"learning_rate": 4.99927651223655e-05,
"loss": 8.7847,
"step": 231
},
{
"epoch": 0.10918931639016355,
"grad_norm": 0.3641432821750641,
"learning_rate": 4.9992290851955325e-05,
"loss": 9.1591,
"step": 232
},
{
"epoch": 0.10965995999529356,
"grad_norm": 0.43097686767578125,
"learning_rate": 4.999180152848551e-05,
"loss": 8.8475,
"step": 233
},
{
"epoch": 0.11013060360042358,
"grad_norm": 0.40101760625839233,
"learning_rate": 4.999129715225077e-05,
"loss": 9.3003,
"step": 234
},
{
"epoch": 0.11060124720555359,
"grad_norm": 0.38456395268440247,
"learning_rate": 4.99907777235549e-05,
"loss": 9.0397,
"step": 235
},
{
"epoch": 0.11107189081068361,
"grad_norm": 0.3518768846988678,
"learning_rate": 4.9990243242710764e-05,
"loss": 9.3619,
"step": 236
},
{
"epoch": 0.11154253441581362,
"grad_norm": 0.43492040038108826,
"learning_rate": 4.9989693710040284e-05,
"loss": 8.9691,
"step": 237
},
{
"epoch": 0.11201317802094364,
"grad_norm": 0.4434773325920105,
"learning_rate": 4.998912912587444e-05,
"loss": 8.6355,
"step": 238
},
{
"epoch": 0.11248382162607365,
"grad_norm": 0.4103478193283081,
"learning_rate": 4.998854949055328e-05,
"loss": 9.0966,
"step": 239
},
{
"epoch": 0.11295446523120367,
"grad_norm": 0.409065842628479,
"learning_rate": 4.998795480442595e-05,
"loss": 8.9825,
"step": 240
},
{
"epoch": 0.11342510883633368,
"grad_norm": 0.3709560036659241,
"learning_rate": 4.9987345067850596e-05,
"loss": 9.383,
"step": 241
},
{
"epoch": 0.1138957524414637,
"grad_norm": 0.4049656391143799,
"learning_rate": 4.9986720281194496e-05,
"loss": 8.8382,
"step": 242
},
{
"epoch": 0.11436639604659371,
"grad_norm": 0.40016597509384155,
"learning_rate": 4.998608044483396e-05,
"loss": 9.0227,
"step": 243
},
{
"epoch": 0.11483703965172373,
"grad_norm": 0.41628897190093994,
"learning_rate": 4.998542555915435e-05,
"loss": 9.1208,
"step": 244
},
{
"epoch": 0.11530768325685374,
"grad_norm": 0.37839028239250183,
"learning_rate": 4.998475562455013e-05,
"loss": 9.2952,
"step": 245
},
{
"epoch": 0.11577832686198376,
"grad_norm": 0.37010782957077026,
"learning_rate": 4.99840706414248e-05,
"loss": 8.8903,
"step": 246
},
{
"epoch": 0.11624897046711377,
"grad_norm": 0.40624648332595825,
"learning_rate": 4.998337061019092e-05,
"loss": 9.1322,
"step": 247
},
{
"epoch": 0.1167196140722438,
"grad_norm": 0.330285906791687,
"learning_rate": 4.998265553127013e-05,
"loss": 9.3509,
"step": 248
},
{
"epoch": 0.1171902576773738,
"grad_norm": 0.4315396249294281,
"learning_rate": 4.9981925405093146e-05,
"loss": 8.5941,
"step": 249
},
{
"epoch": 0.11766090128250382,
"grad_norm": 0.46557149291038513,
"learning_rate": 4.99811802320997e-05,
"loss": 8.7841,
"step": 250
},
{
"epoch": 0.11813154488763383,
"grad_norm": 0.40763556957244873,
"learning_rate": 4.998042001273864e-05,
"loss": 9.0945,
"step": 251
},
{
"epoch": 0.11860218849276385,
"grad_norm": 0.38328826427459717,
"learning_rate": 4.9979644747467835e-05,
"loss": 9.5115,
"step": 252
},
{
"epoch": 0.11907283209789386,
"grad_norm": 0.3737850487232208,
"learning_rate": 4.997885443675424e-05,
"loss": 8.6629,
"step": 253
},
{
"epoch": 0.11954347570302389,
"grad_norm": 0.38939982652664185,
"learning_rate": 4.997804908107387e-05,
"loss": 9.1315,
"step": 254
},
{
"epoch": 0.1200141193081539,
"grad_norm": 0.41033586859703064,
"learning_rate": 4.997722868091179e-05,
"loss": 8.9948,
"step": 255
},
{
"epoch": 0.12048476291328392,
"grad_norm": 0.4496087431907654,
"learning_rate": 4.997639323676214e-05,
"loss": 8.7967,
"step": 256
},
{
"epoch": 0.12095540651841392,
"grad_norm": 0.4463037848472595,
"learning_rate": 4.997554274912811e-05,
"loss": 8.6575,
"step": 257
},
{
"epoch": 0.12142605012354395,
"grad_norm": 0.447477251291275,
"learning_rate": 4.997467721852196e-05,
"loss": 9.4086,
"step": 258
},
{
"epoch": 0.12189669372867395,
"grad_norm": 0.40504494309425354,
"learning_rate": 4.9973796645465e-05,
"loss": 9.6567,
"step": 259
},
{
"epoch": 0.12236733733380398,
"grad_norm": 0.4193851351737976,
"learning_rate": 4.9972901030487616e-05,
"loss": 9.415,
"step": 260
},
{
"epoch": 0.12283798093893399,
"grad_norm": 0.37490740418434143,
"learning_rate": 4.997199037412923e-05,
"loss": 9.094,
"step": 261
},
{
"epoch": 0.12330862454406401,
"grad_norm": 0.4043318033218384,
"learning_rate": 4.997106467693835e-05,
"loss": 9.1566,
"step": 262
},
{
"epoch": 0.12377926814919402,
"grad_norm": 0.3795372247695923,
"learning_rate": 4.997012393947253e-05,
"loss": 9.5975,
"step": 263
},
{
"epoch": 0.12424991175432404,
"grad_norm": 0.38997772336006165,
"learning_rate": 4.996916816229837e-05,
"loss": 9.3275,
"step": 264
},
{
"epoch": 0.12472055535945405,
"grad_norm": 0.41787171363830566,
"learning_rate": 4.9968197345991565e-05,
"loss": 8.9184,
"step": 265
},
{
"epoch": 0.12519119896458406,
"grad_norm": 0.4403538703918457,
"learning_rate": 4.996721149113682e-05,
"loss": 9.0055,
"step": 266
},
{
"epoch": 0.12566184256971408,
"grad_norm": 0.44756266474723816,
"learning_rate": 4.996621059832795e-05,
"loss": 9.0517,
"step": 267
},
{
"epoch": 0.1261324861748441,
"grad_norm": 0.3958662748336792,
"learning_rate": 4.996519466816778e-05,
"loss": 9.1983,
"step": 268
},
{
"epoch": 0.12660312977997412,
"grad_norm": 0.5548920035362244,
"learning_rate": 4.9964163701268224e-05,
"loss": 9.0239,
"step": 269
},
{
"epoch": 0.12707377338510412,
"grad_norm": 0.38231074810028076,
"learning_rate": 4.996311769825024e-05,
"loss": 9.4057,
"step": 270
},
{
"epoch": 0.12754441699023414,
"grad_norm": 0.37411412596702576,
"learning_rate": 4.996205665974384e-05,
"loss": 9.147,
"step": 271
},
{
"epoch": 0.12801506059536416,
"grad_norm": 0.36638572812080383,
"learning_rate": 4.996098058638809e-05,
"loss": 9.3312,
"step": 272
},
{
"epoch": 0.12848570420049418,
"grad_norm": 0.36364972591400146,
"learning_rate": 4.995988947883114e-05,
"loss": 9.4873,
"step": 273
},
{
"epoch": 0.12895634780562418,
"grad_norm": 0.415054053068161,
"learning_rate": 4.9958783337730156e-05,
"loss": 9.0241,
"step": 274
},
{
"epoch": 0.1294269914107542,
"grad_norm": 0.616145133972168,
"learning_rate": 4.995766216375137e-05,
"loss": 9.1209,
"step": 275
},
{
"epoch": 0.12989763501588422,
"grad_norm": 0.3728233575820923,
"learning_rate": 4.9956525957570086e-05,
"loss": 9.5214,
"step": 276
},
{
"epoch": 0.13036827862101424,
"grad_norm": 0.4377942681312561,
"learning_rate": 4.995537471987066e-05,
"loss": 8.7668,
"step": 277
},
{
"epoch": 0.13083892222614424,
"grad_norm": 0.4865539073944092,
"learning_rate": 4.9954208451346465e-05,
"loss": 8.8752,
"step": 278
},
{
"epoch": 0.13130956583127426,
"grad_norm": 0.4728136658668518,
"learning_rate": 4.995302715269997e-05,
"loss": 9.0947,
"step": 279
},
{
"epoch": 0.13178020943640428,
"grad_norm": 0.40794286131858826,
"learning_rate": 4.995183082464269e-05,
"loss": 8.9566,
"step": 280
},
{
"epoch": 0.1322508530415343,
"grad_norm": 0.35321590304374695,
"learning_rate": 4.995061946789516e-05,
"loss": 9.4166,
"step": 281
},
{
"epoch": 0.1327214966466643,
"grad_norm": 0.41053611040115356,
"learning_rate": 4.9949393083187005e-05,
"loss": 9.0913,
"step": 282
},
{
"epoch": 0.13319214025179432,
"grad_norm": 0.4475056231021881,
"learning_rate": 4.9948151671256883e-05,
"loss": 8.422,
"step": 283
},
{
"epoch": 0.13366278385692434,
"grad_norm": 0.34866318106651306,
"learning_rate": 4.994689523285251e-05,
"loss": 9.2168,
"step": 284
},
{
"epoch": 0.13413342746205437,
"grad_norm": 0.4374255836009979,
"learning_rate": 4.994562376873064e-05,
"loss": 8.9508,
"step": 285
},
{
"epoch": 0.13460407106718436,
"grad_norm": 0.38839930295944214,
"learning_rate": 4.9944337279657106e-05,
"loss": 8.8695,
"step": 286
},
{
"epoch": 0.13507471467231438,
"grad_norm": 0.4352591335773468,
"learning_rate": 4.994303576640674e-05,
"loss": 8.7637,
"step": 287
},
{
"epoch": 0.1355453582774444,
"grad_norm": 0.36577296257019043,
"learning_rate": 4.994171922976348e-05,
"loss": 9.4622,
"step": 288
},
{
"epoch": 0.13601600188257443,
"grad_norm": 0.3764691650867462,
"learning_rate": 4.994038767052028e-05,
"loss": 9.3536,
"step": 289
},
{
"epoch": 0.13648664548770442,
"grad_norm": 0.3795958161354065,
"learning_rate": 4.993904108947914e-05,
"loss": 8.9066,
"step": 290
},
{
"epoch": 0.13695728909283444,
"grad_norm": 0.42235082387924194,
"learning_rate": 4.993767948745113e-05,
"loss": 9.168,
"step": 291
},
{
"epoch": 0.13742793269796447,
"grad_norm": 0.41240936517715454,
"learning_rate": 4.993630286525634e-05,
"loss": 8.8015,
"step": 292
},
{
"epoch": 0.1378985763030945,
"grad_norm": 0.40508440136909485,
"learning_rate": 4.993491122372394e-05,
"loss": 8.9218,
"step": 293
},
{
"epoch": 0.13836921990822448,
"grad_norm": 0.44761571288108826,
"learning_rate": 4.99335045636921e-05,
"loss": 8.9542,
"step": 294
},
{
"epoch": 0.1388398635133545,
"grad_norm": 0.35136064887046814,
"learning_rate": 4.993208288600808e-05,
"loss": 9.0036,
"step": 295
},
{
"epoch": 0.13931050711848453,
"grad_norm": 0.3560550808906555,
"learning_rate": 4.9930646191528175e-05,
"loss": 9.5513,
"step": 296
},
{
"epoch": 0.13978115072361455,
"grad_norm": 0.40760746598243713,
"learning_rate": 4.99291944811177e-05,
"loss": 9.1574,
"step": 297
},
{
"epoch": 0.14025179432874454,
"grad_norm": 0.4152514338493347,
"learning_rate": 4.992772775565104e-05,
"loss": 8.9221,
"step": 298
},
{
"epoch": 0.14072243793387457,
"grad_norm": 0.36200031638145447,
"learning_rate": 4.992624601601162e-05,
"loss": 9.2766,
"step": 299
},
{
"epoch": 0.1411930815390046,
"grad_norm": 0.3931048512458801,
"learning_rate": 4.992474926309191e-05,
"loss": 9.0796,
"step": 300
},
{
"epoch": 0.1416637251441346,
"grad_norm": 0.3852521777153015,
"learning_rate": 4.992323749779339e-05,
"loss": 8.9804,
"step": 301
},
{
"epoch": 0.1421343687492646,
"grad_norm": 0.42558741569519043,
"learning_rate": 4.992171072102663e-05,
"loss": 8.6188,
"step": 302
},
{
"epoch": 0.14260501235439463,
"grad_norm": 0.40560707449913025,
"learning_rate": 4.992016893371122e-05,
"loss": 9.2215,
"step": 303
},
{
"epoch": 0.14307565595952465,
"grad_norm": 0.3654381334781647,
"learning_rate": 4.9918612136775776e-05,
"loss": 9.6141,
"step": 304
},
{
"epoch": 0.14354629956465467,
"grad_norm": 0.3547174632549286,
"learning_rate": 4.9917040331157986e-05,
"loss": 9.4322,
"step": 305
},
{
"epoch": 0.14401694316978467,
"grad_norm": 0.3975953161716461,
"learning_rate": 4.9915453517804554e-05,
"loss": 9.0455,
"step": 306
},
{
"epoch": 0.1444875867749147,
"grad_norm": 0.4045639932155609,
"learning_rate": 4.991385169767123e-05,
"loss": 8.6646,
"step": 307
},
{
"epoch": 0.1449582303800447,
"grad_norm": 0.39949241280555725,
"learning_rate": 4.9912234871722805e-05,
"loss": 8.9656,
"step": 308
},
{
"epoch": 0.14542887398517473,
"grad_norm": 0.38490548729896545,
"learning_rate": 4.9910603040933116e-05,
"loss": 9.2289,
"step": 309
},
{
"epoch": 0.14589951759030473,
"grad_norm": 0.38393279910087585,
"learning_rate": 4.9908956206285e-05,
"loss": 9.5308,
"step": 310
},
{
"epoch": 0.14637016119543475,
"grad_norm": 0.41801533102989197,
"learning_rate": 4.990729436877038e-05,
"loss": 9.179,
"step": 311
},
{
"epoch": 0.14684080480056477,
"grad_norm": 0.3734685182571411,
"learning_rate": 4.9905617529390203e-05,
"loss": 9.4323,
"step": 312
},
{
"epoch": 0.1473114484056948,
"grad_norm": 0.38498827815055847,
"learning_rate": 4.9903925689154425e-05,
"loss": 8.7253,
"step": 313
},
{
"epoch": 0.1477820920108248,
"grad_norm": 0.4148082435131073,
"learning_rate": 4.990221884908206e-05,
"loss": 9.5291,
"step": 314
},
{
"epoch": 0.1482527356159548,
"grad_norm": 0.3645360469818115,
"learning_rate": 4.990049701020115e-05,
"loss": 9.3854,
"step": 315
},
{
"epoch": 0.14872337922108483,
"grad_norm": 0.39119553565979004,
"learning_rate": 4.989876017354878e-05,
"loss": 8.8417,
"step": 316
},
{
"epoch": 0.14919402282621486,
"grad_norm": 0.40799564123153687,
"learning_rate": 4.989700834017105e-05,
"loss": 9.1028,
"step": 317
},
{
"epoch": 0.14966466643134485,
"grad_norm": 0.36694031953811646,
"learning_rate": 4.9895241511123114e-05,
"loss": 9.26,
"step": 318
},
{
"epoch": 0.15013531003647487,
"grad_norm": 0.4914778769016266,
"learning_rate": 4.989345968746914e-05,
"loss": 9.3256,
"step": 319
},
{
"epoch": 0.1506059536416049,
"grad_norm": 0.43579304218292236,
"learning_rate": 4.989166287028234e-05,
"loss": 8.7753,
"step": 320
},
{
"epoch": 0.15107659724673492,
"grad_norm": 0.37302032113075256,
"learning_rate": 4.988985106064495e-05,
"loss": 9.3832,
"step": 321
},
{
"epoch": 0.1515472408518649,
"grad_norm": 0.3695763945579529,
"learning_rate": 4.988802425964824e-05,
"loss": 8.7549,
"step": 322
},
{
"epoch": 0.15201788445699493,
"grad_norm": 0.4146966338157654,
"learning_rate": 4.98861824683925e-05,
"loss": 8.8819,
"step": 323
},
{
"epoch": 0.15248852806212496,
"grad_norm": 0.36729514598846436,
"learning_rate": 4.9884325687987056e-05,
"loss": 8.9922,
"step": 324
},
{
"epoch": 0.15295917166725498,
"grad_norm": 0.3997980058193207,
"learning_rate": 4.9882453919550264e-05,
"loss": 9.0574,
"step": 325
},
{
"epoch": 0.15342981527238497,
"grad_norm": 0.31628280878067017,
"learning_rate": 4.9880567164209515e-05,
"loss": 9.7555,
"step": 326
},
{
"epoch": 0.153900458877515,
"grad_norm": 0.3956843316555023,
"learning_rate": 4.98786654231012e-05,
"loss": 9.2441,
"step": 327
},
{
"epoch": 0.15437110248264502,
"grad_norm": 0.399984747171402,
"learning_rate": 4.987674869737077e-05,
"loss": 9.0811,
"step": 328
},
{
"epoch": 0.15484174608777504,
"grad_norm": 0.40124884247779846,
"learning_rate": 4.987481698817268e-05,
"loss": 8.7801,
"step": 329
},
{
"epoch": 0.15531238969290503,
"grad_norm": 0.36277976632118225,
"learning_rate": 4.98728702966704e-05,
"loss": 9.1685,
"step": 330
},
{
"epoch": 0.15578303329803506,
"grad_norm": 0.4415287375450134,
"learning_rate": 4.987090862403646e-05,
"loss": 8.6159,
"step": 331
},
{
"epoch": 0.15625367690316508,
"grad_norm": 0.4005844295024872,
"learning_rate": 4.986893197145237e-05,
"loss": 8.7962,
"step": 332
},
{
"epoch": 0.1567243205082951,
"grad_norm": 0.4147176742553711,
"learning_rate": 4.9866940340108704e-05,
"loss": 9.1667,
"step": 333
},
{
"epoch": 0.1571949641134251,
"grad_norm": 0.5922366976737976,
"learning_rate": 4.986493373120502e-05,
"loss": 9.1685,
"step": 334
},
{
"epoch": 0.15766560771855512,
"grad_norm": 0.42389023303985596,
"learning_rate": 4.986291214594992e-05,
"loss": 8.9005,
"step": 335
},
{
"epoch": 0.15813625132368514,
"grad_norm": 3.3356659412384033,
"learning_rate": 4.986087558556104e-05,
"loss": 8.8868,
"step": 336
},
{
"epoch": 0.15860689492881516,
"grad_norm": 0.3584047853946686,
"learning_rate": 4.9858824051264985e-05,
"loss": 9.3012,
"step": 337
},
{
"epoch": 0.15907753853394516,
"grad_norm": 0.432365357875824,
"learning_rate": 4.985675754429744e-05,
"loss": 8.6683,
"step": 338
},
{
"epoch": 0.15954818213907518,
"grad_norm": 0.4141758680343628,
"learning_rate": 4.985467606590305e-05,
"loss": 8.8902,
"step": 339
},
{
"epoch": 0.1600188257442052,
"grad_norm": 0.5318158268928528,
"learning_rate": 4.985257961733553e-05,
"loss": 9.3213,
"step": 340
},
{
"epoch": 0.16048946934933522,
"grad_norm": 0.4039144814014435,
"learning_rate": 4.985046819985758e-05,
"loss": 9.3521,
"step": 341
},
{
"epoch": 0.16096011295446522,
"grad_norm": 0.4055419862270355,
"learning_rate": 4.984834181474093e-05,
"loss": 9.032,
"step": 342
},
{
"epoch": 0.16143075655959524,
"grad_norm": 0.47234630584716797,
"learning_rate": 4.9846200463266304e-05,
"loss": 8.9415,
"step": 343
},
{
"epoch": 0.16190140016472526,
"grad_norm": 0.3458828628063202,
"learning_rate": 4.984404414672346e-05,
"loss": 9.3418,
"step": 344
},
{
"epoch": 0.16237204376985528,
"grad_norm": 0.4208340048789978,
"learning_rate": 4.9841872866411175e-05,
"loss": 8.5468,
"step": 345
},
{
"epoch": 0.16284268737498528,
"grad_norm": 0.4632960855960846,
"learning_rate": 4.983968662363723e-05,
"loss": 8.357,
"step": 346
},
{
"epoch": 0.1633133309801153,
"grad_norm": 0.3957667946815491,
"learning_rate": 4.98374854197184e-05,
"loss": 9.5873,
"step": 347
},
{
"epoch": 0.16378397458524532,
"grad_norm": 0.45077890157699585,
"learning_rate": 4.98352692559805e-05,
"loss": 8.6973,
"step": 348
},
{
"epoch": 0.16425461819037535,
"grad_norm": 0.36463478207588196,
"learning_rate": 4.983303813375833e-05,
"loss": 9.1421,
"step": 349
},
{
"epoch": 0.16472526179550534,
"grad_norm": 0.4010748565196991,
"learning_rate": 4.983079205439574e-05,
"loss": 9.1377,
"step": 350
},
{
"epoch": 0.16519590540063536,
"grad_norm": 0.39440232515335083,
"learning_rate": 4.982853101924554e-05,
"loss": 8.9753,
"step": 351
},
{
"epoch": 0.16566654900576538,
"grad_norm": 0.4520394504070282,
"learning_rate": 4.9826255029669577e-05,
"loss": 8.7352,
"step": 352
},
{
"epoch": 0.1661371926108954,
"grad_norm": 0.4330653250217438,
"learning_rate": 4.98239640870387e-05,
"loss": 9.0555,
"step": 353
},
{
"epoch": 0.1666078362160254,
"grad_norm": 0.47660115361213684,
"learning_rate": 4.982165819273275e-05,
"loss": 8.6404,
"step": 354
},
{
"epoch": 0.16707847982115542,
"grad_norm": 0.4233279228210449,
"learning_rate": 4.98193373481406e-05,
"loss": 8.9099,
"step": 355
},
{
"epoch": 0.16754912342628545,
"grad_norm": 0.43518248200416565,
"learning_rate": 4.98170015546601e-05,
"loss": 8.6882,
"step": 356
},
{
"epoch": 0.16801976703141547,
"grad_norm": 0.3644963800907135,
"learning_rate": 4.981465081369814e-05,
"loss": 9.2448,
"step": 357
},
{
"epoch": 0.16849041063654546,
"grad_norm": 0.38815975189208984,
"learning_rate": 4.981228512667057e-05,
"loss": 9.558,
"step": 358
},
{
"epoch": 0.16896105424167548,
"grad_norm": 0.4271330237388611,
"learning_rate": 4.980990449500227e-05,
"loss": 8.4688,
"step": 359
},
{
"epoch": 0.1694316978468055,
"grad_norm": 0.4300340712070465,
"learning_rate": 4.980750892012711e-05,
"loss": 8.5112,
"step": 360
},
{
"epoch": 0.16990234145193553,
"grad_norm": 0.3674795627593994,
"learning_rate": 4.980509840348796e-05,
"loss": 9.1979,
"step": 361
},
{
"epoch": 0.17037298505706552,
"grad_norm": 0.39522647857666016,
"learning_rate": 4.980267294653671e-05,
"loss": 9.3743,
"step": 362
},
{
"epoch": 0.17084362866219555,
"grad_norm": 0.4358430504798889,
"learning_rate": 4.980023255073422e-05,
"loss": 9.1216,
"step": 363
},
{
"epoch": 0.17131427226732557,
"grad_norm": 0.40390607714653015,
"learning_rate": 4.9797777217550367e-05,
"loss": 8.9767,
"step": 364
},
{
"epoch": 0.1717849158724556,
"grad_norm": 0.3644031584262848,
"learning_rate": 4.9795306948464e-05,
"loss": 9.2284,
"step": 365
},
{
"epoch": 0.17225555947758558,
"grad_norm": 0.41837140917778015,
"learning_rate": 4.979282174496302e-05,
"loss": 8.8997,
"step": 366
},
{
"epoch": 0.1727262030827156,
"grad_norm": 0.38197219371795654,
"learning_rate": 4.979032160854424e-05,
"loss": 9.1135,
"step": 367
},
{
"epoch": 0.17319684668784563,
"grad_norm": 0.3703914284706116,
"learning_rate": 4.9787806540713546e-05,
"loss": 9.499,
"step": 368
},
{
"epoch": 0.17366749029297565,
"grad_norm": 0.5900145769119263,
"learning_rate": 4.978527654298576e-05,
"loss": 9.6679,
"step": 369
},
{
"epoch": 0.17413813389810565,
"grad_norm": 0.4443458318710327,
"learning_rate": 4.9782731616884736e-05,
"loss": 8.4039,
"step": 370
},
{
"epoch": 0.17460877750323567,
"grad_norm": 0.31717589497566223,
"learning_rate": 4.978017176394331e-05,
"loss": 9.7594,
"step": 371
},
{
"epoch": 0.1750794211083657,
"grad_norm": 0.3682294189929962,
"learning_rate": 4.977759698570328e-05,
"loss": 9.3738,
"step": 372
},
{
"epoch": 0.1755500647134957,
"grad_norm": 0.36333027482032776,
"learning_rate": 4.977500728371547e-05,
"loss": 9.4728,
"step": 373
},
{
"epoch": 0.1760207083186257,
"grad_norm": 0.38923901319503784,
"learning_rate": 4.9772402659539674e-05,
"loss": 9.0362,
"step": 374
},
{
"epoch": 0.17649135192375573,
"grad_norm": 0.3548789620399475,
"learning_rate": 4.9769783114744686e-05,
"loss": 9.4734,
"step": 375
},
{
"epoch": 0.17696199552888575,
"grad_norm": 0.3727724552154541,
"learning_rate": 4.976714865090827e-05,
"loss": 8.9019,
"step": 376
},
{
"epoch": 0.17743263913401577,
"grad_norm": 0.3825220763683319,
"learning_rate": 4.976449926961719e-05,
"loss": 9.4008,
"step": 377
},
{
"epoch": 0.17790328273914577,
"grad_norm": 0.36432167887687683,
"learning_rate": 4.9761834972467185e-05,
"loss": 9.4614,
"step": 378
},
{
"epoch": 0.1783739263442758,
"grad_norm": 0.4360719621181488,
"learning_rate": 4.975915576106299e-05,
"loss": 8.9864,
"step": 379
},
{
"epoch": 0.1788445699494058,
"grad_norm": 0.36198675632476807,
"learning_rate": 4.975646163701831e-05,
"loss": 9.3858,
"step": 380
},
{
"epoch": 0.17931521355453583,
"grad_norm": 0.3615058362483978,
"learning_rate": 4.9753752601955836e-05,
"loss": 9.4513,
"step": 381
},
{
"epoch": 0.17978585715966583,
"grad_norm": 0.38385000824928284,
"learning_rate": 4.975102865750725e-05,
"loss": 9.0129,
"step": 382
},
{
"epoch": 0.18025650076479585,
"grad_norm": 0.42161351442337036,
"learning_rate": 4.9748289805313196e-05,
"loss": 8.8066,
"step": 383
},
{
"epoch": 0.18072714436992587,
"grad_norm": 0.3863692879676819,
"learning_rate": 4.9745536047023324e-05,
"loss": 9.0613,
"step": 384
},
{
"epoch": 0.1811977879750559,
"grad_norm": 0.35685333609580994,
"learning_rate": 4.9742767384296216e-05,
"loss": 9.1823,
"step": 385
},
{
"epoch": 0.1816684315801859,
"grad_norm": 0.4146454930305481,
"learning_rate": 4.973998381879949e-05,
"loss": 9.0627,
"step": 386
},
{
"epoch": 0.1821390751853159,
"grad_norm": 0.40701958537101746,
"learning_rate": 4.973718535220969e-05,
"loss": 9.4653,
"step": 387
},
{
"epoch": 0.18260971879044594,
"grad_norm": 0.5105063915252686,
"learning_rate": 4.973437198621237e-05,
"loss": 9.1349,
"step": 388
},
{
"epoch": 0.18308036239557596,
"grad_norm": 0.3464662730693817,
"learning_rate": 4.973154372250203e-05,
"loss": 9.3152,
"step": 389
},
{
"epoch": 0.18355100600070595,
"grad_norm": 0.3519923985004425,
"learning_rate": 4.972870056278216e-05,
"loss": 9.6833,
"step": 390
},
{
"epoch": 0.18402164960583597,
"grad_norm": 0.3777810037136078,
"learning_rate": 4.972584250876522e-05,
"loss": 8.9543,
"step": 391
},
{
"epoch": 0.184492293210966,
"grad_norm": 0.45620018243789673,
"learning_rate": 4.972296956217265e-05,
"loss": 8.5477,
"step": 392
},
{
"epoch": 0.18496293681609602,
"grad_norm": 0.3768126368522644,
"learning_rate": 4.972008172473483e-05,
"loss": 9.2837,
"step": 393
},
{
"epoch": 0.185433580421226,
"grad_norm": 0.37716034054756165,
"learning_rate": 4.971717899819113e-05,
"loss": 9.0821,
"step": 394
},
{
"epoch": 0.18590422402635604,
"grad_norm": 0.40171629190444946,
"learning_rate": 4.9714261384289896e-05,
"loss": 9.0963,
"step": 395
},
{
"epoch": 0.18637486763148606,
"grad_norm": 0.41346555948257446,
"learning_rate": 4.9711328884788434e-05,
"loss": 8.6835,
"step": 396
},
{
"epoch": 0.18684551123661608,
"grad_norm": 0.3882580101490021,
"learning_rate": 4.970838150145299e-05,
"loss": 8.998,
"step": 397
},
{
"epoch": 0.18731615484174607,
"grad_norm": 0.40618547797203064,
"learning_rate": 4.9705419236058825e-05,
"loss": 8.8586,
"step": 398
},
{
"epoch": 0.1877867984468761,
"grad_norm": 0.4610426127910614,
"learning_rate": 4.970244209039012e-05,
"loss": 8.5731,
"step": 399
},
{
"epoch": 0.18825744205200612,
"grad_norm": 0.3799988329410553,
"learning_rate": 4.969945006624003e-05,
"loss": 8.9463,
"step": 400
},
{
"epoch": 0.18872808565713614,
"grad_norm": 0.37528830766677856,
"learning_rate": 4.969644316541068e-05,
"loss": 8.9402,
"step": 401
},
{
"epoch": 0.18919872926226614,
"grad_norm": 0.3422936201095581,
"learning_rate": 4.9693421389713156e-05,
"loss": 9.3497,
"step": 402
},
{
"epoch": 0.18966937286739616,
"grad_norm": 0.35784366726875305,
"learning_rate": 4.969038474096749e-05,
"loss": 9.1984,
"step": 403
},
{
"epoch": 0.19014001647252618,
"grad_norm": 0.36203494668006897,
"learning_rate": 4.96873332210027e-05,
"loss": 9.5096,
"step": 404
},
{
"epoch": 0.1906106600776562,
"grad_norm": 0.3657507598400116,
"learning_rate": 4.9684266831656706e-05,
"loss": 9.4901,
"step": 405
},
{
"epoch": 0.1910813036827862,
"grad_norm": 0.3886093199253082,
"learning_rate": 4.9681185574776446e-05,
"loss": 9.2492,
"step": 406
},
{
"epoch": 0.19155194728791622,
"grad_norm": 0.4091348350048065,
"learning_rate": 4.967808945221778e-05,
"loss": 8.9341,
"step": 407
},
{
"epoch": 0.19202259089304624,
"grad_norm": 0.45772606134414673,
"learning_rate": 4.967497846584552e-05,
"loss": 9.1159,
"step": 408
},
{
"epoch": 0.19249323449817626,
"grad_norm": 0.4274662733078003,
"learning_rate": 4.967185261753345e-05,
"loss": 9.0557,
"step": 409
},
{
"epoch": 0.19296387810330626,
"grad_norm": 0.3963877558708191,
"learning_rate": 4.96687119091643e-05,
"loss": 9.2221,
"step": 410
},
{
"epoch": 0.19343452170843628,
"grad_norm": 0.3958019018173218,
"learning_rate": 4.966555634262972e-05,
"loss": 8.7826,
"step": 411
},
{
"epoch": 0.1939051653135663,
"grad_norm": 0.3447028398513794,
"learning_rate": 4.9662385919830347e-05,
"loss": 9.5672,
"step": 412
},
{
"epoch": 0.19437580891869632,
"grad_norm": 0.41687721014022827,
"learning_rate": 4.965920064267575e-05,
"loss": 8.7692,
"step": 413
},
{
"epoch": 0.19484645252382632,
"grad_norm": 0.40204861760139465,
"learning_rate": 4.9656000513084455e-05,
"loss": 8.9861,
"step": 414
},
{
"epoch": 0.19531709612895634,
"grad_norm": 0.3969802260398865,
"learning_rate": 4.965278553298392e-05,
"loss": 8.7663,
"step": 415
},
{
"epoch": 0.19578773973408636,
"grad_norm": 0.3831544518470764,
"learning_rate": 4.964955570431055e-05,
"loss": 9.1338,
"step": 416
},
{
"epoch": 0.19625838333921639,
"grad_norm": 0.40865185856819153,
"learning_rate": 4.96463110290097e-05,
"loss": 8.7582,
"step": 417
},
{
"epoch": 0.19672902694434638,
"grad_norm": 0.36668238043785095,
"learning_rate": 4.964305150903566e-05,
"loss": 9.185,
"step": 418
},
{
"epoch": 0.1971996705494764,
"grad_norm": 0.4229344129562378,
"learning_rate": 4.963977714635168e-05,
"loss": 9.0629,
"step": 419
},
{
"epoch": 0.19767031415460642,
"grad_norm": 0.36557090282440186,
"learning_rate": 4.963648794292992e-05,
"loss": 9.2807,
"step": 420
},
{
"epoch": 0.19814095775973645,
"grad_norm": 0.36382701992988586,
"learning_rate": 4.9633183900751504e-05,
"loss": 9.3589,
"step": 421
},
{
"epoch": 0.19861160136486644,
"grad_norm": 0.34733355045318604,
"learning_rate": 4.962986502180648e-05,
"loss": 9.246,
"step": 422
},
{
"epoch": 0.19908224496999646,
"grad_norm": 0.39794841408729553,
"learning_rate": 4.962653130809383e-05,
"loss": 8.8009,
"step": 423
},
{
"epoch": 0.19955288857512649,
"grad_norm": 1.290969967842102,
"learning_rate": 4.962318276162148e-05,
"loss": 8.8199,
"step": 424
},
{
"epoch": 0.2000235321802565,
"grad_norm": 0.41390761733055115,
"learning_rate": 4.961981938440629e-05,
"loss": 8.8504,
"step": 425
},
{
"epoch": 0.2004941757853865,
"grad_norm": 0.4563705623149872,
"learning_rate": 4.9616441178474044e-05,
"loss": 8.4598,
"step": 426
},
{
"epoch": 0.20096481939051652,
"grad_norm": 0.41248825192451477,
"learning_rate": 4.9613048145859465e-05,
"loss": 8.9862,
"step": 427
},
{
"epoch": 0.20143546299564655,
"grad_norm": 0.3711670935153961,
"learning_rate": 4.9609640288606205e-05,
"loss": 9.1376,
"step": 428
},
{
"epoch": 0.20190610660077657,
"grad_norm": 0.3998201787471771,
"learning_rate": 4.960621760876686e-05,
"loss": 8.8631,
"step": 429
},
{
"epoch": 0.20237675020590656,
"grad_norm": 0.39512693881988525,
"learning_rate": 4.96027801084029e-05,
"loss": 8.6108,
"step": 430
},
{
"epoch": 0.20284739381103659,
"grad_norm": 0.40403223037719727,
"learning_rate": 4.95993277895848e-05,
"loss": 8.9947,
"step": 431
},
{
"epoch": 0.2033180374161666,
"grad_norm": 0.37190157175064087,
"learning_rate": 4.959586065439189e-05,
"loss": 9.0393,
"step": 432
},
{
"epoch": 0.20378868102129663,
"grad_norm": 0.49797308444976807,
"learning_rate": 4.959237870491247e-05,
"loss": 8.4229,
"step": 433
},
{
"epoch": 0.20425932462642662,
"grad_norm": 0.4093763828277588,
"learning_rate": 4.958888194324374e-05,
"loss": 9.2132,
"step": 434
},
{
"epoch": 0.20472996823155665,
"grad_norm": 0.4164353609085083,
"learning_rate": 4.958537037149183e-05,
"loss": 9.3971,
"step": 435
},
{
"epoch": 0.20520061183668667,
"grad_norm": 0.4578768312931061,
"learning_rate": 4.958184399177178e-05,
"loss": 8.8712,
"step": 436
},
{
"epoch": 0.2056712554418167,
"grad_norm": 0.3586215674877167,
"learning_rate": 4.957830280620758e-05,
"loss": 9.3741,
"step": 437
},
{
"epoch": 0.20614189904694669,
"grad_norm": 0.4265285134315491,
"learning_rate": 4.9574746816932084e-05,
"loss": 9.5791,
"step": 438
},
{
"epoch": 0.2066125426520767,
"grad_norm": 0.4029577672481537,
"learning_rate": 4.9571176026087116e-05,
"loss": 8.7589,
"step": 439
},
{
"epoch": 0.20708318625720673,
"grad_norm": 0.38180944323539734,
"learning_rate": 4.9567590435823383e-05,
"loss": 9.0139,
"step": 440
},
{
"epoch": 0.20755382986233675,
"grad_norm": 0.39456745982170105,
"learning_rate": 4.9563990048300524e-05,
"loss": 9.1201,
"step": 441
},
{
"epoch": 0.20802447346746675,
"grad_norm": 0.5495271682739258,
"learning_rate": 4.956037486568706e-05,
"loss": 8.5788,
"step": 442
},
{
"epoch": 0.20849511707259677,
"grad_norm": 0.4691711366176605,
"learning_rate": 4.9556744890160477e-05,
"loss": 8.6122,
"step": 443
},
{
"epoch": 0.2089657606777268,
"grad_norm": 0.42626431584358215,
"learning_rate": 4.955310012390711e-05,
"loss": 9.0031,
"step": 444
},
{
"epoch": 0.20943640428285681,
"grad_norm": 0.3541715145111084,
"learning_rate": 4.954944056912224e-05,
"loss": 9.3784,
"step": 445
},
{
"epoch": 0.2099070478879868,
"grad_norm": 0.3353878855705261,
"learning_rate": 4.954576622801006e-05,
"loss": 9.2536,
"step": 446
},
{
"epoch": 0.21037769149311683,
"grad_norm": 0.45526987314224243,
"learning_rate": 4.954207710278364e-05,
"loss": 8.8725,
"step": 447
},
{
"epoch": 0.21084833509824685,
"grad_norm": 0.3993997275829315,
"learning_rate": 4.953837319566497e-05,
"loss": 8.7531,
"step": 448
},
{
"epoch": 0.21131897870337688,
"grad_norm": 0.4544302821159363,
"learning_rate": 4.953465450888495e-05,
"loss": 8.6906,
"step": 449
},
{
"epoch": 0.21178962230850687,
"grad_norm": 0.35516420006752014,
"learning_rate": 4.9530921044683374e-05,
"loss": 9.0749,
"step": 450
},
{
"epoch": 0.2122602659136369,
"grad_norm": 1.6792665719985962,
"learning_rate": 4.9527172805308944e-05,
"loss": 9.2437,
"step": 451
},
{
"epoch": 0.21273090951876691,
"grad_norm": 0.46345287561416626,
"learning_rate": 4.952340979301924e-05,
"loss": 9.0281,
"step": 452
},
{
"epoch": 0.21320155312389694,
"grad_norm": 0.447298139333725,
"learning_rate": 4.951963201008076e-05,
"loss": 8.9642,
"step": 453
},
{
"epoch": 0.21367219672902693,
"grad_norm": 0.4767840504646301,
"learning_rate": 4.9515839458768905e-05,
"loss": 8.359,
"step": 454
},
{
"epoch": 0.21414284033415695,
"grad_norm": 0.4263994097709656,
"learning_rate": 4.9512032141367946e-05,
"loss": 9.1196,
"step": 455
},
{
"epoch": 0.21461348393928698,
"grad_norm": 0.4342626929283142,
"learning_rate": 4.950821006017107e-05,
"loss": 8.6583,
"step": 456
},
{
"epoch": 0.215084127544417,
"grad_norm": 0.3934561610221863,
"learning_rate": 4.950437321748034e-05,
"loss": 9.0519,
"step": 457
},
{
"epoch": 0.215554771149547,
"grad_norm": 0.4860813319683075,
"learning_rate": 4.9500521615606716e-05,
"loss": 8.5634,
"step": 458
},
{
"epoch": 0.21602541475467701,
"grad_norm": 0.35411691665649414,
"learning_rate": 4.949665525687005e-05,
"loss": 9.1898,
"step": 459
},
{
"epoch": 0.21649605835980704,
"grad_norm": 0.4290132224559784,
"learning_rate": 4.94927741435991e-05,
"loss": 8.9995,
"step": 460
},
{
"epoch": 0.21696670196493706,
"grad_norm": 0.3373097777366638,
"learning_rate": 4.948887827813147e-05,
"loss": 9.3386,
"step": 461
},
{
"epoch": 0.21743734557006705,
"grad_norm": 0.42341887950897217,
"learning_rate": 4.948496766281368e-05,
"loss": 9.3743,
"step": 462
},
{
"epoch": 0.21790798917519708,
"grad_norm": 0.3915397524833679,
"learning_rate": 4.9481042300001124e-05,
"loss": 9.1503,
"step": 463
},
{
"epoch": 0.2183786327803271,
"grad_norm": 0.4155285954475403,
"learning_rate": 4.947710219205808e-05,
"loss": 9.0803,
"step": 464
},
{
"epoch": 0.21884927638545712,
"grad_norm": 0.4009873867034912,
"learning_rate": 4.94731473413577e-05,
"loss": 8.8088,
"step": 465
},
{
"epoch": 0.21931991999058711,
"grad_norm": 0.3694516122341156,
"learning_rate": 4.946917775028204e-05,
"loss": 9.6886,
"step": 466
},
{
"epoch": 0.21979056359571714,
"grad_norm": 0.4301382899284363,
"learning_rate": 4.946519342122199e-05,
"loss": 8.8388,
"step": 467
},
{
"epoch": 0.22026120720084716,
"grad_norm": 0.3725178837776184,
"learning_rate": 4.946119435657738e-05,
"loss": 9.3083,
"step": 468
},
{
"epoch": 0.22073185080597718,
"grad_norm": 0.34573477506637573,
"learning_rate": 4.945718055875684e-05,
"loss": 9.3972,
"step": 469
},
{
"epoch": 0.22120249441110718,
"grad_norm": 0.4900851845741272,
"learning_rate": 4.945315203017795e-05,
"loss": 8.8847,
"step": 470
},
{
"epoch": 0.2216731380162372,
"grad_norm": 0.3375721871852875,
"learning_rate": 4.944910877326709e-05,
"loss": 9.3369,
"step": 471
},
{
"epoch": 0.22214378162136722,
"grad_norm": 0.38274478912353516,
"learning_rate": 4.944505079045958e-05,
"loss": 9.2587,
"step": 472
},
{
"epoch": 0.22261442522649724,
"grad_norm": 0.45915624499320984,
"learning_rate": 4.944097808419955e-05,
"loss": 8.6162,
"step": 473
},
{
"epoch": 0.22308506883162724,
"grad_norm": 0.4436270296573639,
"learning_rate": 4.9436890656940045e-05,
"loss": 8.9692,
"step": 474
},
{
"epoch": 0.22355571243675726,
"grad_norm": 0.44073861837387085,
"learning_rate": 4.943278851114293e-05,
"loss": 8.6524,
"step": 475
},
{
"epoch": 0.22402635604188728,
"grad_norm": 0.37401431798934937,
"learning_rate": 4.942867164927899e-05,
"loss": 9.3269,
"step": 476
},
{
"epoch": 0.2244969996470173,
"grad_norm": 0.36092767119407654,
"learning_rate": 4.942454007382782e-05,
"loss": 9.0893,
"step": 477
},
{
"epoch": 0.2249676432521473,
"grad_norm": 0.46312302350997925,
"learning_rate": 4.9420393787277917e-05,
"loss": 9.3986,
"step": 478
},
{
"epoch": 0.22543828685727732,
"grad_norm": 0.339429646730423,
"learning_rate": 4.9416232792126615e-05,
"loss": 9.3501,
"step": 479
},
{
"epoch": 0.22590893046240734,
"grad_norm": 0.4019092917442322,
"learning_rate": 4.941205709088011e-05,
"loss": 8.8818,
"step": 480
},
{
"epoch": 0.22637957406753736,
"grad_norm": 0.4025574028491974,
"learning_rate": 4.940786668605348e-05,
"loss": 9.0087,
"step": 481
},
{
"epoch": 0.22685021767266736,
"grad_norm": 0.41925379633903503,
"learning_rate": 4.9403661580170626e-05,
"loss": 9.0019,
"step": 482
},
{
"epoch": 0.22732086127779738,
"grad_norm": 0.38912633061408997,
"learning_rate": 4.939944177576432e-05,
"loss": 9.4554,
"step": 483
},
{
"epoch": 0.2277915048829274,
"grad_norm": 0.3775523602962494,
"learning_rate": 4.9395207275376175e-05,
"loss": 8.911,
"step": 484
},
{
"epoch": 0.22826214848805743,
"grad_norm": 0.37626808881759644,
"learning_rate": 4.939095808155668e-05,
"loss": 8.9951,
"step": 485
},
{
"epoch": 0.22873279209318742,
"grad_norm": 0.4059127867221832,
"learning_rate": 4.938669419686516e-05,
"loss": 9.0841,
"step": 486
},
{
"epoch": 0.22920343569831744,
"grad_norm": 0.35881519317626953,
"learning_rate": 4.938241562386977e-05,
"loss": 9.2341,
"step": 487
},
{
"epoch": 0.22967407930344746,
"grad_norm": 0.42100849747657776,
"learning_rate": 4.9378122365147536e-05,
"loss": 9.0711,
"step": 488
},
{
"epoch": 0.2301447229085775,
"grad_norm": 0.4081602394580841,
"learning_rate": 4.9373814423284336e-05,
"loss": 9.0102,
"step": 489
},
{
"epoch": 0.23061536651370748,
"grad_norm": 0.3893739581108093,
"learning_rate": 4.936949180087486e-05,
"loss": 9.1481,
"step": 490
},
{
"epoch": 0.2310860101188375,
"grad_norm": 0.38784539699554443,
"learning_rate": 4.936515450052267e-05,
"loss": 9.2699,
"step": 491
},
{
"epoch": 0.23155665372396753,
"grad_norm": 0.39232099056243896,
"learning_rate": 4.9360802524840156e-05,
"loss": 9.1015,
"step": 492
},
{
"epoch": 0.23202729732909755,
"grad_norm": 0.4174420237541199,
"learning_rate": 4.935643587644855e-05,
"loss": 8.8689,
"step": 493
},
{
"epoch": 0.23249794093422754,
"grad_norm": 0.3970744013786316,
"learning_rate": 4.9352054557977905e-05,
"loss": 9.134,
"step": 494
},
{
"epoch": 0.23296858453935756,
"grad_norm": 0.34588709473609924,
"learning_rate": 4.934765857206715e-05,
"loss": 9.1163,
"step": 495
},
{
"epoch": 0.2334392281444876,
"grad_norm": 0.38045328855514526,
"learning_rate": 4.934324792136399e-05,
"loss": 9.2736,
"step": 496
},
{
"epoch": 0.2339098717496176,
"grad_norm": 0.3795531094074249,
"learning_rate": 4.9338822608525027e-05,
"loss": 9.2326,
"step": 497
},
{
"epoch": 0.2343805153547476,
"grad_norm": 0.3959232270717621,
"learning_rate": 4.9334382636215646e-05,
"loss": 9.2973,
"step": 498
},
{
"epoch": 0.23485115895987763,
"grad_norm": 0.40320464968681335,
"learning_rate": 4.932992800711009e-05,
"loss": 8.8766,
"step": 499
},
{
"epoch": 0.23532180256500765,
"grad_norm": 0.35472753643989563,
"learning_rate": 4.9325458723891405e-05,
"loss": 9.2191,
"step": 500
},
{
"epoch": 0.23579244617013767,
"grad_norm": 0.40472298860549927,
"learning_rate": 4.932097478925148e-05,
"loss": 8.8783,
"step": 501
},
{
"epoch": 0.23626308977526767,
"grad_norm": 0.4293891489505768,
"learning_rate": 4.931647620589104e-05,
"loss": 8.4516,
"step": 502
},
{
"epoch": 0.2367337333803977,
"grad_norm": 0.3897256851196289,
"learning_rate": 4.9311962976519586e-05,
"loss": 9.2541,
"step": 503
},
{
"epoch": 0.2372043769855277,
"grad_norm": 0.36981016397476196,
"learning_rate": 4.9307435103855507e-05,
"loss": 9.0664,
"step": 504
},
{
"epoch": 0.23767502059065773,
"grad_norm": 0.4339733421802521,
"learning_rate": 4.930289259062596e-05,
"loss": 9.2965,
"step": 505
},
{
"epoch": 0.23814566419578773,
"grad_norm": 0.4204358756542206,
"learning_rate": 4.9298335439566946e-05,
"loss": 9.0738,
"step": 506
},
{
"epoch": 0.23861630780091775,
"grad_norm": 0.3759208023548126,
"learning_rate": 4.929376365342326e-05,
"loss": 9.5119,
"step": 507
},
{
"epoch": 0.23908695140604777,
"grad_norm": 0.3684697151184082,
"learning_rate": 4.9289177234948535e-05,
"loss": 9.338,
"step": 508
},
{
"epoch": 0.2395575950111778,
"grad_norm": 0.40956175327301025,
"learning_rate": 4.928457618690522e-05,
"loss": 9.0164,
"step": 509
},
{
"epoch": 0.2400282386163078,
"grad_norm": 0.4373653829097748,
"learning_rate": 4.927996051206454e-05,
"loss": 8.4385,
"step": 510
},
{
"epoch": 0.2404988822214378,
"grad_norm": 0.3845258951187134,
"learning_rate": 4.927533021320657e-05,
"loss": 9.3247,
"step": 511
},
{
"epoch": 0.24096952582656783,
"grad_norm": 0.3763442039489746,
"learning_rate": 4.9270685293120164e-05,
"loss": 9.357,
"step": 512
},
{
"epoch": 0.24144016943169785,
"grad_norm": 0.4450169503688812,
"learning_rate": 4.9266025754603005e-05,
"loss": 8.5107,
"step": 513
},
{
"epoch": 0.24191081303682785,
"grad_norm": 0.41103556752204895,
"learning_rate": 4.926135160046157e-05,
"loss": 9.3063,
"step": 514
},
{
"epoch": 0.24238145664195787,
"grad_norm": 0.4856661856174469,
"learning_rate": 4.925666283351114e-05,
"loss": 8.7831,
"step": 515
},
{
"epoch": 0.2428521002470879,
"grad_norm": 0.3764643371105194,
"learning_rate": 4.92519594565758e-05,
"loss": 9.0384,
"step": 516
},
{
"epoch": 0.24332274385221792,
"grad_norm": 0.3988141417503357,
"learning_rate": 4.924724147248841e-05,
"loss": 9.1045,
"step": 517
},
{
"epoch": 0.2437933874573479,
"grad_norm": 0.3450901210308075,
"learning_rate": 4.924250888409069e-05,
"loss": 9.3091,
"step": 518
},
{
"epoch": 0.24426403106247793,
"grad_norm": 0.4347275495529175,
"learning_rate": 4.923776169423309e-05,
"loss": 9.115,
"step": 519
},
{
"epoch": 0.24473467466760795,
"grad_norm": 0.36428380012512207,
"learning_rate": 4.923299990577488e-05,
"loss": 9.0645,
"step": 520
},
{
"epoch": 0.24520531827273798,
"grad_norm": 0.4311101734638214,
"learning_rate": 4.922822352158412e-05,
"loss": 8.7247,
"step": 521
},
{
"epoch": 0.24567596187786797,
"grad_norm": 0.4824456572532654,
"learning_rate": 4.922343254453768e-05,
"loss": 8.7448,
"step": 522
},
{
"epoch": 0.246146605482998,
"grad_norm": 0.5465502738952637,
"learning_rate": 4.9218626977521206e-05,
"loss": 8.471,
"step": 523
},
{
"epoch": 0.24661724908812802,
"grad_norm": 0.4191696047782898,
"learning_rate": 4.921380682342912e-05,
"loss": 8.4572,
"step": 524
},
{
"epoch": 0.24708789269325804,
"grad_norm": 0.40454065799713135,
"learning_rate": 4.920897208516464e-05,
"loss": 9.3254,
"step": 525
},
{
"epoch": 0.24755853629838803,
"grad_norm": 0.36362919211387634,
"learning_rate": 4.920412276563977e-05,
"loss": 9.4725,
"step": 526
},
{
"epoch": 0.24802917990351805,
"grad_norm": 0.38239118456840515,
"learning_rate": 4.91992588677753e-05,
"loss": 8.5503,
"step": 527
},
{
"epoch": 0.24849982350864808,
"grad_norm": 0.3423115015029907,
"learning_rate": 4.919438039450078e-05,
"loss": 9.294,
"step": 528
},
{
"epoch": 0.2489704671137781,
"grad_norm": 0.3812299966812134,
"learning_rate": 4.918948734875457e-05,
"loss": 9.374,
"step": 529
},
{
"epoch": 0.2494411107189081,
"grad_norm": 0.5085097551345825,
"learning_rate": 4.9184579733483796e-05,
"loss": 8.5979,
"step": 530
},
{
"epoch": 0.24991175432403812,
"grad_norm": 0.34993723034858704,
"learning_rate": 4.917965755164433e-05,
"loss": 9.4077,
"step": 531
}
],
"logging_steps": 1,
"max_steps": 4248,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 531,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.093280422836306e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}