dag_qwen_sft_v0 / trainer_state.json
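The log_history array below holds one record per logged optimizer step, each with epoch, grad_norm, learning_rate, loss, and step. A minimal sketch for inspecting a dump like this, assuming the JSON has been saved locally under the hypothetical name trainer_state.json:

```python
import json

# Minimal sketch, not part of the original repo: summarize a Trainer state dump.
# Assumes the JSON below is saved locally as "trainer_state.json" (hypothetical path).
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step records; a trailing summary entry, if present, may lack these keys.
logs = [e for e in state["log_history"] if "loss" in e and "grad_norm" in e]

print(f"records: {len(logs)}  global_step: {state['global_step']}  epoch: {state['epoch']:.3f}")
print(f"loss: {logs[0]['loss']:.4f} (step {logs[0]['step']}) -> "
      f"{logs[-1]['loss']:.4f} (step {logs[-1]['step']})")
peak = max(logs, key=lambda e: e["grad_norm"])
print(f"peak grad_norm: {peak['grad_norm']:.2f} at step {peak['step']}")
```

Plotting loss or learning_rate against step from the same records works the same way with any plotting library.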
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9937952430196484,
"eval_steps": 500,
"global_step": 482,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004136504653567736,
"grad_norm": 8.454320907592773,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.4689,
"step": 1
},
{
"epoch": 0.008273009307135471,
"grad_norm": 8.755942344665527,
"learning_rate": 8.000000000000001e-07,
"loss": 0.4625,
"step": 2
},
{
"epoch": 0.012409513960703205,
"grad_norm": 13.382512092590332,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.4319,
"step": 3
},
{
"epoch": 0.016546018614270942,
"grad_norm": 11.072649955749512,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.471,
"step": 4
},
{
"epoch": 0.020682523267838676,
"grad_norm": 4.4571709632873535,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.4341,
"step": 5
},
{
"epoch": 0.02481902792140641,
"grad_norm": 4.237286567687988,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.4637,
"step": 6
},
{
"epoch": 0.028955532574974147,
"grad_norm": 3.21901535987854,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.4598,
"step": 7
},
{
"epoch": 0.033092037228541885,
"grad_norm": 2.7905218601226807,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.4174,
"step": 8
},
{
"epoch": 0.03722854188210962,
"grad_norm": 2.5547449588775635,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.4488,
"step": 9
},
{
"epoch": 0.04136504653567735,
"grad_norm": 2.075817584991455,
"learning_rate": 4.000000000000001e-06,
"loss": 0.4323,
"step": 10
},
{
"epoch": 0.045501551189245086,
"grad_norm": 1.1852331161499023,
"learning_rate": 4.4e-06,
"loss": 0.404,
"step": 11
},
{
"epoch": 0.04963805584281282,
"grad_norm": 1.370549201965332,
"learning_rate": 4.800000000000001e-06,
"loss": 0.3687,
"step": 12
},
{
"epoch": 0.05377456049638056,
"grad_norm": 0.7430139780044556,
"learning_rate": 5.2e-06,
"loss": 0.3812,
"step": 13
},
{
"epoch": 0.057911065149948295,
"grad_norm": 0.8032243251800537,
"learning_rate": 5.600000000000001e-06,
"loss": 0.373,
"step": 14
},
{
"epoch": 0.06204756980351603,
"grad_norm": 2.7111802101135254,
"learning_rate": 6e-06,
"loss": 0.371,
"step": 15
},
{
"epoch": 0.06618407445708377,
"grad_norm": 0.8430923819541931,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.3891,
"step": 16
},
{
"epoch": 0.0703205791106515,
"grad_norm": 0.6954956650733948,
"learning_rate": 6.800000000000001e-06,
"loss": 0.376,
"step": 17
},
{
"epoch": 0.07445708376421924,
"grad_norm": 0.7058322429656982,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.3958,
"step": 18
},
{
"epoch": 0.07859358841778696,
"grad_norm": 0.5975633859634399,
"learning_rate": 7.600000000000001e-06,
"loss": 0.3674,
"step": 19
},
{
"epoch": 0.0827300930713547,
"grad_norm": 0.6905612945556641,
"learning_rate": 8.000000000000001e-06,
"loss": 0.3925,
"step": 20
},
{
"epoch": 0.08686659772492245,
"grad_norm": 0.6662179827690125,
"learning_rate": 8.400000000000001e-06,
"loss": 0.3837,
"step": 21
},
{
"epoch": 0.09100310237849017,
"grad_norm": 0.9616004824638367,
"learning_rate": 8.8e-06,
"loss": 0.3805,
"step": 22
},
{
"epoch": 0.09513960703205791,
"grad_norm": 1.6762669086456299,
"learning_rate": 9.200000000000002e-06,
"loss": 0.3661,
"step": 23
},
{
"epoch": 0.09927611168562564,
"grad_norm": 3.642876148223877,
"learning_rate": 9.600000000000001e-06,
"loss": 0.3723,
"step": 24
},
{
"epoch": 0.10341261633919338,
"grad_norm": 22.331893920898438,
"learning_rate": 1e-05,
"loss": 0.4012,
"step": 25
},
{
"epoch": 0.10754912099276112,
"grad_norm": 4.078958034515381,
"learning_rate": 9.999881857639567e-06,
"loss": 0.4019,
"step": 26
},
{
"epoch": 0.11168562564632885,
"grad_norm": 2.163355827331543,
"learning_rate": 9.999527436141312e-06,
"loss": 0.4275,
"step": 27
},
{
"epoch": 0.11582213029989659,
"grad_norm": 1.0988469123840332,
"learning_rate": 9.998936752254111e-06,
"loss": 0.3885,
"step": 28
},
{
"epoch": 0.11995863495346432,
"grad_norm": 1.299137830734253,
"learning_rate": 9.998109833891883e-06,
"loss": 0.388,
"step": 29
},
{
"epoch": 0.12409513960703206,
"grad_norm": 5.283950328826904,
"learning_rate": 9.997046720132262e-06,
"loss": 0.4219,
"step": 30
},
{
"epoch": 0.1282316442605998,
"grad_norm": 0.8062543869018555,
"learning_rate": 9.995747461214752e-06,
"loss": 0.3589,
"step": 31
},
{
"epoch": 0.13236814891416754,
"grad_norm": 0.7722073793411255,
"learning_rate": 9.994212118538364e-06,
"loss": 0.3486,
"step": 32
},
{
"epoch": 0.13650465356773525,
"grad_norm": 0.762801468372345,
"learning_rate": 9.992440764658697e-06,
"loss": 0.3676,
"step": 33
},
{
"epoch": 0.140641158221303,
"grad_norm": 0.745180606842041,
"learning_rate": 9.990433483284527e-06,
"loss": 0.4115,
"step": 34
},
{
"epoch": 0.14477766287487073,
"grad_norm": 0.8027282953262329,
"learning_rate": 9.988190369273834e-06,
"loss": 0.4001,
"step": 35
},
{
"epoch": 0.14891416752843847,
"grad_norm": 0.6682556867599487,
"learning_rate": 9.985711528629332e-06,
"loss": 0.3637,
"step": 36
},
{
"epoch": 0.15305067218200621,
"grad_norm": 0.6948946714401245,
"learning_rate": 9.982997078493457e-06,
"loss": 0.3488,
"step": 37
},
{
"epoch": 0.15718717683557393,
"grad_norm": 0.7381304502487183,
"learning_rate": 9.980047147142824e-06,
"loss": 0.3777,
"step": 38
},
{
"epoch": 0.16132368148914167,
"grad_norm": 0.6775998473167419,
"learning_rate": 9.976861873982177e-06,
"loss": 0.3904,
"step": 39
},
{
"epoch": 0.1654601861427094,
"grad_norm": 0.8822210431098938,
"learning_rate": 9.973441409537795e-06,
"loss": 0.383,
"step": 40
},
{
"epoch": 0.16959669079627715,
"grad_norm": 0.657383382320404,
"learning_rate": 9.969785915450368e-06,
"loss": 0.3882,
"step": 41
},
{
"epoch": 0.1737331954498449,
"grad_norm": 0.6214635372161865,
"learning_rate": 9.965895564467381e-06,
"loss": 0.3922,
"step": 42
},
{
"epoch": 0.1778697001034126,
"grad_norm": 0.6440220475196838,
"learning_rate": 9.961770540434931e-06,
"loss": 0.3796,
"step": 43
},
{
"epoch": 0.18200620475698034,
"grad_norm": 0.6130467653274536,
"learning_rate": 9.95741103828905e-06,
"loss": 0.3562,
"step": 44
},
{
"epoch": 0.18614270941054809,
"grad_norm": 0.6214406490325928,
"learning_rate": 9.952817264046486e-06,
"loss": 0.396,
"step": 45
},
{
"epoch": 0.19027921406411583,
"grad_norm": 0.5837990641593933,
"learning_rate": 9.947989434794973e-06,
"loss": 0.3455,
"step": 46
},
{
"epoch": 0.19441571871768357,
"grad_norm": 0.6432331800460815,
"learning_rate": 9.942927778682968e-06,
"loss": 0.3791,
"step": 47
},
{
"epoch": 0.19855222337125128,
"grad_norm": 0.6208926439285278,
"learning_rate": 9.937632534908872e-06,
"loss": 0.4059,
"step": 48
},
{
"epoch": 0.20268872802481902,
"grad_norm": 0.6121624112129211,
"learning_rate": 9.932103953709724e-06,
"loss": 0.3693,
"step": 49
},
{
"epoch": 0.20682523267838676,
"grad_norm": 0.5415109395980835,
"learning_rate": 9.926342296349378e-06,
"loss": 0.3192,
"step": 50
},
{
"epoch": 0.2109617373319545,
"grad_norm": 0.5551713109016418,
"learning_rate": 9.920347835106152e-06,
"loss": 0.3563,
"step": 51
},
{
"epoch": 0.21509824198552224,
"grad_norm": 0.6338883638381958,
"learning_rate": 9.914120853259968e-06,
"loss": 0.3917,
"step": 52
},
{
"epoch": 0.21923474663908996,
"grad_norm": 0.6104925274848938,
"learning_rate": 9.90766164507896e-06,
"loss": 0.3983,
"step": 53
},
{
"epoch": 0.2233712512926577,
"grad_norm": 0.592183530330658,
"learning_rate": 9.900970515805564e-06,
"loss": 0.341,
"step": 54
},
{
"epoch": 0.22750775594622544,
"grad_norm": 0.513282060623169,
"learning_rate": 9.89404778164211e-06,
"loss": 0.3581,
"step": 55
},
{
"epoch": 0.23164426059979318,
"grad_norm": 0.5831045508384705,
"learning_rate": 9.886893769735852e-06,
"loss": 0.3561,
"step": 56
},
{
"epoch": 0.23578076525336092,
"grad_norm": 0.5578728914260864,
"learning_rate": 9.879508818163536e-06,
"loss": 0.3615,
"step": 57
},
{
"epoch": 0.23991726990692863,
"grad_norm": 0.6278296709060669,
"learning_rate": 9.871893275915408e-06,
"loss": 0.3675,
"step": 58
},
{
"epoch": 0.24405377456049637,
"grad_norm": 0.7174540758132935,
"learning_rate": 9.864047502878717e-06,
"loss": 0.3633,
"step": 59
},
{
"epoch": 0.2481902792140641,
"grad_norm": 0.5807657837867737,
"learning_rate": 9.855971869820726e-06,
"loss": 0.3567,
"step": 60
},
{
"epoch": 0.25232678386763185,
"grad_norm": 0.5903241038322449,
"learning_rate": 9.847666758371175e-06,
"loss": 0.3864,
"step": 61
},
{
"epoch": 0.2564632885211996,
"grad_norm": 0.5542154312133789,
"learning_rate": 9.83913256100425e-06,
"loss": 0.3763,
"step": 62
},
{
"epoch": 0.26059979317476734,
"grad_norm": 0.6011348962783813,
"learning_rate": 9.830369681020043e-06,
"loss": 0.363,
"step": 63
},
{
"epoch": 0.2647362978283351,
"grad_norm": 0.5147583484649658,
"learning_rate": 9.821378532525479e-06,
"loss": 0.3634,
"step": 64
},
{
"epoch": 0.2688728024819028,
"grad_norm": 0.5299031734466553,
"learning_rate": 9.812159540414766e-06,
"loss": 0.3703,
"step": 65
},
{
"epoch": 0.2730093071354705,
"grad_norm": 0.525841474533081,
"learning_rate": 9.802713140349294e-06,
"loss": 0.3592,
"step": 66
},
{
"epoch": 0.27714581178903824,
"grad_norm": 0.47922268509864807,
"learning_rate": 9.79303977873707e-06,
"loss": 0.3484,
"step": 67
},
{
"epoch": 0.281282316442606,
"grad_norm": 0.5722537636756897,
"learning_rate": 9.783139912711597e-06,
"loss": 0.3435,
"step": 68
},
{
"epoch": 0.2854188210961737,
"grad_norm": 0.603398859500885,
"learning_rate": 9.773014010110298e-06,
"loss": 0.3995,
"step": 69
},
{
"epoch": 0.28955532574974147,
"grad_norm": 0.529694139957428,
"learning_rate": 9.76266254945238e-06,
"loss": 0.3934,
"step": 70
},
{
"epoch": 0.2936918304033092,
"grad_norm": 0.5173729062080383,
"learning_rate": 9.752086019916246e-06,
"loss": 0.3618,
"step": 71
},
{
"epoch": 0.29782833505687695,
"grad_norm": 0.5737492442131042,
"learning_rate": 9.74128492131636e-06,
"loss": 0.377,
"step": 72
},
{
"epoch": 0.3019648397104447,
"grad_norm": 0.6340614557266235,
"learning_rate": 9.730259764079636e-06,
"loss": 0.3887,
"step": 73
},
{
"epoch": 0.30610134436401243,
"grad_norm": 0.5502659678459167,
"learning_rate": 9.719011069221316e-06,
"loss": 0.3749,
"step": 74
},
{
"epoch": 0.31023784901758017,
"grad_norm": 0.48107632994651794,
"learning_rate": 9.70753936832034e-06,
"loss": 0.3445,
"step": 75
},
{
"epoch": 0.31437435367114785,
"grad_norm": 0.47837021946907043,
"learning_rate": 9.695845203494242e-06,
"loss": 0.3566,
"step": 76
},
{
"epoch": 0.3185108583247156,
"grad_norm": 0.5170641541481018,
"learning_rate": 9.683929127373514e-06,
"loss": 0.3878,
"step": 77
},
{
"epoch": 0.32264736297828334,
"grad_norm": 0.5370326638221741,
"learning_rate": 9.671791703075502e-06,
"loss": 0.3545,
"step": 78
},
{
"epoch": 0.3267838676318511,
"grad_norm": 0.5362874865531921,
"learning_rate": 9.659433504177786e-06,
"loss": 0.3947,
"step": 79
},
{
"epoch": 0.3309203722854188,
"grad_norm": 0.5446822047233582,
"learning_rate": 9.646855114691081e-06,
"loss": 0.3777,
"step": 80
},
{
"epoch": 0.33505687693898656,
"grad_norm": 0.5000081658363342,
"learning_rate": 9.63405712903164e-06,
"loss": 0.3713,
"step": 81
},
{
"epoch": 0.3391933815925543,
"grad_norm": 0.4431915581226349,
"learning_rate": 9.621040151993153e-06,
"loss": 0.3508,
"step": 82
},
{
"epoch": 0.34332988624612204,
"grad_norm": 0.51210618019104,
"learning_rate": 9.607804798718182e-06,
"loss": 0.3702,
"step": 83
},
{
"epoch": 0.3474663908996898,
"grad_norm": 0.49018731713294983,
"learning_rate": 9.59435169466907e-06,
"loss": 0.3796,
"step": 84
},
{
"epoch": 0.3516028955532575,
"grad_norm": 0.5700220465660095,
"learning_rate": 9.580681475598413e-06,
"loss": 0.3882,
"step": 85
},
{
"epoch": 0.3557394002068252,
"grad_norm": 0.48753252625465393,
"learning_rate": 9.566794787518986e-06,
"loss": 0.3773,
"step": 86
},
{
"epoch": 0.35987590486039295,
"grad_norm": 0.47645437717437744,
"learning_rate": 9.552692286673231e-06,
"loss": 0.3478,
"step": 87
},
{
"epoch": 0.3640124095139607,
"grad_norm": 0.4645499587059021,
"learning_rate": 9.538374639502247e-06,
"loss": 0.3523,
"step": 88
},
{
"epoch": 0.36814891416752843,
"grad_norm": 0.4936198890209198,
"learning_rate": 9.523842522614285e-06,
"loss": 0.3233,
"step": 89
},
{
"epoch": 0.37228541882109617,
"grad_norm": 0.47896862030029297,
"learning_rate": 9.509096622752781e-06,
"loss": 0.3583,
"step": 90
},
{
"epoch": 0.3764219234746639,
"grad_norm": 0.4804452955722809,
"learning_rate": 9.4941376367639e-06,
"loss": 0.3441,
"step": 91
},
{
"epoch": 0.38055842812823165,
"grad_norm": 0.47014203667640686,
"learning_rate": 9.478966271563614e-06,
"loss": 0.3406,
"step": 92
},
{
"epoch": 0.3846949327817994,
"grad_norm": 0.5452392101287842,
"learning_rate": 9.463583244104274e-06,
"loss": 0.3658,
"step": 93
},
{
"epoch": 0.38883143743536713,
"grad_norm": 0.49594131112098694,
"learning_rate": 9.447989281340753e-06,
"loss": 0.3644,
"step": 94
},
{
"epoch": 0.3929679420889349,
"grad_norm": 0.48177802562713623,
"learning_rate": 9.43218512019608e-06,
"loss": 0.364,
"step": 95
},
{
"epoch": 0.39710444674250256,
"grad_norm": 0.4789188504219055,
"learning_rate": 9.416171507526615e-06,
"loss": 0.3724,
"step": 96
},
{
"epoch": 0.4012409513960703,
"grad_norm": 0.5925107598304749,
"learning_rate": 9.399949200086757e-06,
"loss": 0.3799,
"step": 97
},
{
"epoch": 0.40537745604963804,
"grad_norm": 0.540553092956543,
"learning_rate": 9.383518964493183e-06,
"loss": 0.3913,
"step": 98
},
{
"epoch": 0.4095139607032058,
"grad_norm": 0.5033954977989197,
"learning_rate": 9.36688157718862e-06,
"loss": 0.3882,
"step": 99
},
{
"epoch": 0.4136504653567735,
"grad_norm": 0.4835229218006134,
"learning_rate": 9.350037824405151e-06,
"loss": 0.357,
"step": 100
},
{
"epoch": 0.41778697001034126,
"grad_norm": 0.5028110146522522,
"learning_rate": 9.332988502127063e-06,
"loss": 0.3395,
"step": 101
},
{
"epoch": 0.421923474663909,
"grad_norm": 0.6103828549385071,
"learning_rate": 9.315734416053223e-06,
"loss": 0.3832,
"step": 102
},
{
"epoch": 0.42605997931747674,
"grad_norm": 0.4925767481327057,
"learning_rate": 9.298276381559015e-06,
"loss": 0.3414,
"step": 103
},
{
"epoch": 0.4301964839710445,
"grad_norm": 0.5328059792518616,
"learning_rate": 9.280615223657801e-06,
"loss": 0.3887,
"step": 104
},
{
"epoch": 0.4343329886246122,
"grad_norm": 0.5046906471252441,
"learning_rate": 9.262751776961936e-06,
"loss": 0.3608,
"step": 105
},
{
"epoch": 0.4384694932781799,
"grad_norm": 0.4689864218235016,
"learning_rate": 9.24468688564332e-06,
"loss": 0.3734,
"step": 106
},
{
"epoch": 0.44260599793174765,
"grad_norm": 0.46193334460258484,
"learning_rate": 9.226421403393513e-06,
"loss": 0.3557,
"step": 107
},
{
"epoch": 0.4467425025853154,
"grad_norm": 0.518205463886261,
"learning_rate": 9.207956193383392e-06,
"loss": 0.3293,
"step": 108
},
{
"epoch": 0.45087900723888313,
"grad_norm": 0.5061272978782654,
"learning_rate": 9.189292128222355e-06,
"loss": 0.3477,
"step": 109
},
{
"epoch": 0.4550155118924509,
"grad_norm": 0.46607810258865356,
"learning_rate": 9.170430089917089e-06,
"loss": 0.3978,
"step": 110
},
{
"epoch": 0.4591520165460186,
"grad_norm": 0.4538101851940155,
"learning_rate": 9.151370969829883e-06,
"loss": 0.3525,
"step": 111
},
{
"epoch": 0.46328852119958636,
"grad_norm": 0.4456521272659302,
"learning_rate": 9.132115668636512e-06,
"loss": 0.3575,
"step": 112
},
{
"epoch": 0.4674250258531541,
"grad_norm": 0.5219409465789795,
"learning_rate": 9.112665096283668e-06,
"loss": 0.3703,
"step": 113
},
{
"epoch": 0.47156153050672184,
"grad_norm": 0.5195448398590088,
"learning_rate": 9.093020171945966e-06,
"loss": 0.3651,
"step": 114
},
{
"epoch": 0.4756980351602896,
"grad_norm": 0.5239256620407104,
"learning_rate": 9.073181823982495e-06,
"loss": 0.3555,
"step": 115
},
{
"epoch": 0.47983453981385726,
"grad_norm": 0.4262794852256775,
"learning_rate": 9.05315098989296e-06,
"loss": 0.3303,
"step": 116
},
{
"epoch": 0.483971044467425,
"grad_norm": 0.4619412422180176,
"learning_rate": 9.032928616273369e-06,
"loss": 0.3612,
"step": 117
},
{
"epoch": 0.48810754912099275,
"grad_norm": 0.468650758266449,
"learning_rate": 9.012515658771301e-06,
"loss": 0.3725,
"step": 118
},
{
"epoch": 0.4922440537745605,
"grad_norm": 0.4874132573604584,
"learning_rate": 8.991913082040752e-06,
"loss": 0.3671,
"step": 119
},
{
"epoch": 0.4963805584281282,
"grad_norm": 0.48114946484565735,
"learning_rate": 8.971121859696539e-06,
"loss": 0.3603,
"step": 120
},
{
"epoch": 0.500517063081696,
"grad_norm": 0.5342724919319153,
"learning_rate": 8.950142974268295e-06,
"loss": 0.3561,
"step": 121
},
{
"epoch": 0.5046535677352637,
"grad_norm": 0.5296602845191956,
"learning_rate": 8.928977417154037e-06,
"loss": 0.3552,
"step": 122
},
{
"epoch": 0.5087900723888314,
"grad_norm": 0.47604137659072876,
"learning_rate": 8.907626188573319e-06,
"loss": 0.3751,
"step": 123
},
{
"epoch": 0.5129265770423992,
"grad_norm": 0.544127345085144,
"learning_rate": 8.886090297519956e-06,
"loss": 0.39,
"step": 124
},
{
"epoch": 0.5170630816959669,
"grad_norm": 0.495714396238327,
"learning_rate": 8.864370761714348e-06,
"loss": 0.3764,
"step": 125
},
{
"epoch": 0.5211995863495347,
"grad_norm": 0.45246466994285583,
"learning_rate": 8.842468607555389e-06,
"loss": 0.3273,
"step": 126
},
{
"epoch": 0.5253360910031024,
"grad_norm": 0.46964627504348755,
"learning_rate": 8.820384870071951e-06,
"loss": 0.3712,
"step": 127
},
{
"epoch": 0.5294725956566702,
"grad_norm": 0.5150438547134399,
"learning_rate": 8.79812059287399e-06,
"loss": 0.3676,
"step": 128
},
{
"epoch": 0.5336091003102379,
"grad_norm": 0.48608115315437317,
"learning_rate": 8.775676828103205e-06,
"loss": 0.3862,
"step": 129
},
{
"epoch": 0.5377456049638056,
"grad_norm": 0.5238416790962219,
"learning_rate": 8.753054636383336e-06,
"loss": 0.3927,
"step": 130
},
{
"epoch": 0.5418821096173733,
"grad_norm": 0.4756030738353729,
"learning_rate": 8.730255086770037e-06,
"loss": 0.3429,
"step": 131
},
{
"epoch": 0.546018614270941,
"grad_norm": 0.46515801548957825,
"learning_rate": 8.707279256700348e-06,
"loss": 0.3367,
"step": 132
},
{
"epoch": 0.5501551189245087,
"grad_norm": 0.5517006516456604,
"learning_rate": 8.684128231941789e-06,
"loss": 0.3688,
"step": 133
},
{
"epoch": 0.5542916235780765,
"grad_norm": 0.5072327852249146,
"learning_rate": 8.660803106541044e-06,
"loss": 0.3224,
"step": 134
},
{
"epoch": 0.5584281282316442,
"grad_norm": 0.4414540231227875,
"learning_rate": 8.637304982772263e-06,
"loss": 0.3166,
"step": 135
},
{
"epoch": 0.562564632885212,
"grad_norm": 0.5401909351348877,
"learning_rate": 8.613634971084967e-06,
"loss": 0.3697,
"step": 136
},
{
"epoch": 0.5667011375387797,
"grad_norm": 0.502416729927063,
"learning_rate": 8.589794190051582e-06,
"loss": 0.3647,
"step": 137
},
{
"epoch": 0.5708376421923474,
"grad_norm": 0.49979519844055176,
"learning_rate": 8.56578376631456e-06,
"loss": 0.3542,
"step": 138
},
{
"epoch": 0.5749741468459152,
"grad_norm": 0.4783455431461334,
"learning_rate": 8.541604834533159e-06,
"loss": 0.3577,
"step": 139
},
{
"epoch": 0.5791106514994829,
"grad_norm": 0.4866260886192322,
"learning_rate": 8.51725853732981e-06,
"loss": 0.3567,
"step": 140
},
{
"epoch": 0.5832471561530507,
"grad_norm": 0.480307012796402,
"learning_rate": 8.492746025236113e-06,
"loss": 0.335,
"step": 141
},
{
"epoch": 0.5873836608066184,
"grad_norm": 0.4932575821876526,
"learning_rate": 8.468068456638491e-06,
"loss": 0.3411,
"step": 142
},
{
"epoch": 0.5915201654601862,
"grad_norm": 0.49242812395095825,
"learning_rate": 8.443226997723426e-06,
"loss": 0.3589,
"step": 143
},
{
"epoch": 0.5956566701137539,
"grad_norm": 0.5170210599899292,
"learning_rate": 8.418222822422348e-06,
"loss": 0.385,
"step": 144
},
{
"epoch": 0.5997931747673216,
"grad_norm": 0.45948079228401184,
"learning_rate": 8.393057112356181e-06,
"loss": 0.3502,
"step": 145
},
{
"epoch": 0.6039296794208894,
"grad_norm": 0.47525444626808167,
"learning_rate": 8.367731056779476e-06,
"loss": 0.3387,
"step": 146
},
{
"epoch": 0.6080661840744571,
"grad_norm": 0.4996655583381653,
"learning_rate": 8.342245852524229e-06,
"loss": 0.3243,
"step": 147
},
{
"epoch": 0.6122026887280249,
"grad_norm": 0.4717055559158325,
"learning_rate": 8.316602703943315e-06,
"loss": 0.3696,
"step": 148
},
{
"epoch": 0.6163391933815926,
"grad_norm": 0.5229761600494385,
"learning_rate": 8.290802822853576e-06,
"loss": 0.4026,
"step": 149
},
{
"epoch": 0.6204756980351603,
"grad_norm": 0.4920945465564728,
"learning_rate": 8.26484742847855e-06,
"loss": 0.3555,
"step": 150
},
{
"epoch": 0.6246122026887281,
"grad_norm": 0.416532963514328,
"learning_rate": 8.238737747390859e-06,
"loss": 0.3145,
"step": 151
},
{
"epoch": 0.6287487073422957,
"grad_norm": 0.4948025941848755,
"learning_rate": 8.212475013454249e-06,
"loss": 0.3603,
"step": 152
},
{
"epoch": 0.6328852119958635,
"grad_norm": 0.4692654013633728,
"learning_rate": 8.186060467765268e-06,
"loss": 0.3541,
"step": 153
},
{
"epoch": 0.6370217166494312,
"grad_norm": 0.4930100440979004,
"learning_rate": 8.159495358594627e-06,
"loss": 0.328,
"step": 154
},
{
"epoch": 0.6411582213029989,
"grad_norm": 0.46493637561798096,
"learning_rate": 8.13278094132821e-06,
"loss": 0.3514,
"step": 155
},
{
"epoch": 0.6452947259565667,
"grad_norm": 0.5059131383895874,
"learning_rate": 8.10591847840774e-06,
"loss": 0.3522,
"step": 156
},
{
"epoch": 0.6494312306101344,
"grad_norm": 0.5415008664131165,
"learning_rate": 8.078909239271127e-06,
"loss": 0.345,
"step": 157
},
{
"epoch": 0.6535677352637022,
"grad_norm": 0.49019861221313477,
"learning_rate": 8.051754500292479e-06,
"loss": 0.3526,
"step": 158
},
{
"epoch": 0.6577042399172699,
"grad_norm": 0.4391830563545227,
"learning_rate": 8.024455544721778e-06,
"loss": 0.3368,
"step": 159
},
{
"epoch": 0.6618407445708376,
"grad_norm": 0.5758143663406372,
"learning_rate": 7.997013662624246e-06,
"loss": 0.3606,
"step": 160
},
{
"epoch": 0.6659772492244054,
"grad_norm": 0.46434131264686584,
"learning_rate": 7.969430150819372e-06,
"loss": 0.3263,
"step": 161
},
{
"epoch": 0.6701137538779731,
"grad_norm": 0.5234054923057556,
"learning_rate": 7.941706312819632e-06,
"loss": 0.3635,
"step": 162
},
{
"epoch": 0.6742502585315409,
"grad_norm": 0.46102845668792725,
"learning_rate": 7.913843458768892e-06,
"loss": 0.3487,
"step": 163
},
{
"epoch": 0.6783867631851086,
"grad_norm": 0.505272388458252,
"learning_rate": 7.88584290538049e-06,
"loss": 0.3687,
"step": 164
},
{
"epoch": 0.6825232678386763,
"grad_norm": 0.5247129797935486,
"learning_rate": 7.857705975875015e-06,
"loss": 0.3575,
"step": 165
},
{
"epoch": 0.6866597724922441,
"grad_norm": 0.4995476007461548,
"learning_rate": 7.829433999917773e-06,
"loss": 0.3583,
"step": 166
},
{
"epoch": 0.6907962771458118,
"grad_norm": 0.478943407535553,
"learning_rate": 7.801028313555954e-06,
"loss": 0.3539,
"step": 167
},
{
"epoch": 0.6949327817993796,
"grad_norm": 0.4750828146934509,
"learning_rate": 7.772490259155493e-06,
"loss": 0.3317,
"step": 168
},
{
"epoch": 0.6990692864529473,
"grad_norm": 0.4754940867424011,
"learning_rate": 7.743821185337634e-06,
"loss": 0.3209,
"step": 169
},
{
"epoch": 0.703205791106515,
"grad_norm": 0.4950121343135834,
"learning_rate": 7.715022446915195e-06,
"loss": 0.3341,
"step": 170
},
{
"epoch": 0.7073422957600828,
"grad_norm": 0.48745468258857727,
"learning_rate": 7.686095404828552e-06,
"loss": 0.3602,
"step": 171
},
{
"epoch": 0.7114788004136504,
"grad_norm": 0.48764947056770325,
"learning_rate": 7.65704142608132e-06,
"loss": 0.3624,
"step": 172
},
{
"epoch": 0.7156153050672182,
"grad_norm": 0.5114070773124695,
"learning_rate": 7.627861883675748e-06,
"loss": 0.3449,
"step": 173
},
{
"epoch": 0.7197518097207859,
"grad_norm": 0.4847152829170227,
"learning_rate": 7.598558156547842e-06,
"loss": 0.3318,
"step": 174
},
{
"epoch": 0.7238883143743536,
"grad_norm": 0.5162774920463562,
"learning_rate": 7.569131629502201e-06,
"loss": 0.3539,
"step": 175
},
{
"epoch": 0.7280248190279214,
"grad_norm": 0.49352213740348816,
"learning_rate": 7.53958369314657e-06,
"loss": 0.3504,
"step": 176
},
{
"epoch": 0.7321613236814891,
"grad_norm": 0.4514661133289337,
"learning_rate": 7.509915743826128e-06,
"loss": 0.3602,
"step": 177
},
{
"epoch": 0.7362978283350569,
"grad_norm": 0.5056818127632141,
"learning_rate": 7.480129183557499e-06,
"loss": 0.3511,
"step": 178
},
{
"epoch": 0.7404343329886246,
"grad_norm": 0.5009995102882385,
"learning_rate": 7.450225419962498e-06,
"loss": 0.3299,
"step": 179
},
{
"epoch": 0.7445708376421923,
"grad_norm": 0.5529451966285706,
"learning_rate": 7.4202058662016155e-06,
"loss": 0.3605,
"step": 180
},
{
"epoch": 0.7487073422957601,
"grad_norm": 0.5108004212379456,
"learning_rate": 7.390071940907222e-06,
"loss": 0.3497,
"step": 181
},
{
"epoch": 0.7528438469493278,
"grad_norm": 0.45150938630104065,
"learning_rate": 7.3598250681165485e-06,
"loss": 0.347,
"step": 182
},
{
"epoch": 0.7569803516028956,
"grad_norm": 0.49005162715911865,
"learning_rate": 7.329466677204371e-06,
"loss": 0.3485,
"step": 183
},
{
"epoch": 0.7611168562564633,
"grad_norm": 0.4927361011505127,
"learning_rate": 7.298998202815474e-06,
"loss": 0.3432,
"step": 184
},
{
"epoch": 0.765253360910031,
"grad_norm": 0.48336061835289,
"learning_rate": 7.268421084796852e-06,
"loss": 0.3443,
"step": 185
},
{
"epoch": 0.7693898655635988,
"grad_norm": 0.48736652731895447,
"learning_rate": 7.237736768129663e-06,
"loss": 0.3418,
"step": 186
},
{
"epoch": 0.7735263702171665,
"grad_norm": 0.4602266252040863,
"learning_rate": 7.206946702860948e-06,
"loss": 0.3322,
"step": 187
},
{
"epoch": 0.7776628748707343,
"grad_norm": 0.4475662410259247,
"learning_rate": 7.176052344035101e-06,
"loss": 0.3519,
"step": 188
},
{
"epoch": 0.781799379524302,
"grad_norm": 0.46669697761535645,
"learning_rate": 7.145055151625113e-06,
"loss": 0.3623,
"step": 189
},
{
"epoch": 0.7859358841778697,
"grad_norm": 0.4729274809360504,
"learning_rate": 7.1139565904635755e-06,
"loss": 0.3517,
"step": 190
},
{
"epoch": 0.7900723888314375,
"grad_norm": 0.49703437089920044,
"learning_rate": 7.082758130173456e-06,
"loss": 0.3732,
"step": 191
},
{
"epoch": 0.7942088934850051,
"grad_norm": 0.5119916200637817,
"learning_rate": 7.051461245098654e-06,
"loss": 0.3421,
"step": 192
},
{
"epoch": 0.7983453981385729,
"grad_norm": 0.4503278434276581,
"learning_rate": 7.020067414234315e-06,
"loss": 0.3342,
"step": 193
},
{
"epoch": 0.8024819027921406,
"grad_norm": 0.46572044491767883,
"learning_rate": 6.988578121156956e-06,
"loss": 0.3314,
"step": 194
},
{
"epoch": 0.8066184074457083,
"grad_norm": 0.49221017956733704,
"learning_rate": 6.956994853954342e-06,
"loss": 0.3634,
"step": 195
},
{
"epoch": 0.8107549120992761,
"grad_norm": 0.5337055921554565,
"learning_rate": 6.925319105155165e-06,
"loss": 0.346,
"step": 196
},
{
"epoch": 0.8148914167528438,
"grad_norm": 0.4575997591018677,
"learning_rate": 6.8935523716585195e-06,
"loss": 0.3538,
"step": 197
},
{
"epoch": 0.8190279214064116,
"grad_norm": 0.5041812062263489,
"learning_rate": 6.8616961546631575e-06,
"loss": 0.3548,
"step": 198
},
{
"epoch": 0.8231644260599793,
"grad_norm": 0.4733670651912689,
"learning_rate": 6.829751959596544e-06,
"loss": 0.3414,
"step": 199
},
{
"epoch": 0.827300930713547,
"grad_norm": 0.48330968618392944,
"learning_rate": 6.797721296043727e-06,
"loss": 0.325,
"step": 200
},
{
"epoch": 0.8314374353671148,
"grad_norm": 0.4963349997997284,
"learning_rate": 6.765605677675982e-06,
"loss": 0.3858,
"step": 201
},
{
"epoch": 0.8355739400206825,
"grad_norm": 0.5333994626998901,
"learning_rate": 6.733406622179295e-06,
"loss": 0.3538,
"step": 202
},
{
"epoch": 0.8397104446742503,
"grad_norm": 0.4624415338039398,
"learning_rate": 6.701125651182631e-06,
"loss": 0.3025,
"step": 203
},
{
"epoch": 0.843846949327818,
"grad_norm": 0.45845648646354675,
"learning_rate": 6.668764290186039e-06,
"loss": 0.3458,
"step": 204
},
{
"epoch": 0.8479834539813857,
"grad_norm": 0.5057909488677979,
"learning_rate": 6.6363240684885465e-06,
"loss": 0.33,
"step": 205
},
{
"epoch": 0.8521199586349535,
"grad_norm": 0.5474227666854858,
"learning_rate": 6.603806519115899e-06,
"loss": 0.3386,
"step": 206
},
{
"epoch": 0.8562564632885212,
"grad_norm": 0.5117132067680359,
"learning_rate": 6.571213178748112e-06,
"loss": 0.3775,
"step": 207
},
{
"epoch": 0.860392967942089,
"grad_norm": 0.4669731557369232,
"learning_rate": 6.538545587646854e-06,
"loss": 0.3575,
"step": 208
},
{
"epoch": 0.8645294725956567,
"grad_norm": 0.4318840503692627,
"learning_rate": 6.50580528958265e-06,
"loss": 0.3201,
"step": 209
},
{
"epoch": 0.8686659772492245,
"grad_norm": 0.5034843683242798,
"learning_rate": 6.47299383176194e-06,
"loss": 0.3169,
"step": 210
},
{
"epoch": 0.8728024819027922,
"grad_norm": 0.5146070122718811,
"learning_rate": 6.440112764753956e-06,
"loss": 0.3653,
"step": 211
},
{
"epoch": 0.8769389865563598,
"grad_norm": 0.49277129769325256,
"learning_rate": 6.4071636424174435e-06,
"loss": 0.3485,
"step": 212
},
{
"epoch": 0.8810754912099276,
"grad_norm": 0.4620700776576996,
"learning_rate": 6.374148021827237e-06,
"loss": 0.3525,
"step": 213
},
{
"epoch": 0.8852119958634953,
"grad_norm": 0.5235023498535156,
"learning_rate": 6.341067463200678e-06,
"loss": 0.3638,
"step": 214
},
{
"epoch": 0.889348500517063,
"grad_norm": 0.4999266564846039,
"learning_rate": 6.307923529823876e-06,
"loss": 0.3692,
"step": 215
},
{
"epoch": 0.8934850051706308,
"grad_norm": 0.46116530895233154,
"learning_rate": 6.2747177879778424e-06,
"loss": 0.3316,
"step": 216
},
{
"epoch": 0.8976215098241985,
"grad_norm": 0.4651578664779663,
"learning_rate": 6.241451806864465e-06,
"loss": 0.3176,
"step": 217
},
{
"epoch": 0.9017580144777663,
"grad_norm": 0.45744726061820984,
"learning_rate": 6.208127158532358e-06,
"loss": 0.3261,
"step": 218
},
{
"epoch": 0.905894519131334,
"grad_norm": 0.4478837549686432,
"learning_rate": 6.174745417802563e-06,
"loss": 0.3357,
"step": 219
},
{
"epoch": 0.9100310237849017,
"grad_norm": 0.49428898096084595,
"learning_rate": 6.141308162194141e-06,
"loss": 0.321,
"step": 220
},
{
"epoch": 0.9141675284384695,
"grad_norm": 0.4366098642349243,
"learning_rate": 6.1078169718496164e-06,
"loss": 0.3132,
"step": 221
},
{
"epoch": 0.9183040330920372,
"grad_norm": 0.5066491365432739,
"learning_rate": 6.074273429460296e-06,
"loss": 0.3342,
"step": 222
},
{
"epoch": 0.922440537745605,
"grad_norm": 0.4254951775074005,
"learning_rate": 6.040679120191491e-06,
"loss": 0.3089,
"step": 223
},
{
"epoch": 0.9265770423991727,
"grad_norm": 0.46807774901390076,
"learning_rate": 6.007035631607605e-06,
"loss": 0.3182,
"step": 224
},
{
"epoch": 0.9307135470527405,
"grad_norm": 0.4610796570777893,
"learning_rate": 5.9733445535970915e-06,
"loss": 0.3239,
"step": 225
},
{
"epoch": 0.9348500517063082,
"grad_norm": 0.5245600342750549,
"learning_rate": 5.939607478297347e-06,
"loss": 0.3818,
"step": 226
},
{
"epoch": 0.9389865563598759,
"grad_norm": 0.45463472604751587,
"learning_rate": 5.905826000019458e-06,
"loss": 0.3109,
"step": 227
},
{
"epoch": 0.9431230610134437,
"grad_norm": 0.46084877848625183,
"learning_rate": 5.8720017151728526e-06,
"loss": 0.3475,
"step": 228
},
{
"epoch": 0.9472595656670114,
"grad_norm": 0.46296611428260803,
"learning_rate": 5.838136222189874e-06,
"loss": 0.3343,
"step": 229
},
{
"epoch": 0.9513960703205792,
"grad_norm": 0.458286315202713,
"learning_rate": 5.804231121450235e-06,
"loss": 0.3454,
"step": 230
},
{
"epoch": 0.9555325749741469,
"grad_norm": 0.42349058389663696,
"learning_rate": 5.770288015205385e-06,
"loss": 0.329,
"step": 231
},
{
"epoch": 0.9596690796277145,
"grad_norm": 0.4541251063346863,
"learning_rate": 5.736308507502805e-06,
"loss": 0.3296,
"step": 232
},
{
"epoch": 0.9638055842812823,
"grad_norm": 0.4887123107910156,
"learning_rate": 5.702294204110191e-06,
"loss": 0.3374,
"step": 233
},
{
"epoch": 0.96794208893485,
"grad_norm": 0.46135684847831726,
"learning_rate": 5.668246712439579e-06,
"loss": 0.3426,
"step": 234
},
{
"epoch": 0.9720785935884177,
"grad_norm": 0.4848094582557678,
"learning_rate": 5.634167641471383e-06,
"loss": 0.3626,
"step": 235
},
{
"epoch": 0.9762150982419855,
"grad_norm": 0.4424203932285309,
"learning_rate": 5.600058601678357e-06,
"loss": 0.302,
"step": 236
},
{
"epoch": 0.9803516028955532,
"grad_norm": 0.46382346749305725,
"learning_rate": 5.5659212049494915e-06,
"loss": 0.3357,
"step": 237
},
{
"epoch": 0.984488107549121,
"grad_norm": 0.4296742379665375,
"learning_rate": 5.531757064513837e-06,
"loss": 0.3162,
"step": 238
},
{
"epoch": 0.9886246122026887,
"grad_norm": 0.42605388164520264,
"learning_rate": 5.4975677948642704e-06,
"loss": 0.3204,
"step": 239
},
{
"epoch": 0.9927611168562565,
"grad_norm": 0.4539097547531128,
"learning_rate": 5.4633550116812e-06,
"loss": 0.327,
"step": 240
},
{
"epoch": 0.9968976215098242,
"grad_norm": 0.4806179404258728,
"learning_rate": 5.429120331756208e-06,
"loss": 0.3469,
"step": 241
},
{
"epoch": 1.001034126163392,
"grad_norm": 0.4494527280330658,
"learning_rate": 5.394865372915656e-06,
"loss": 0.3304,
"step": 242
},
{
"epoch": 1.0051706308169597,
"grad_norm": 0.5063448548316956,
"learning_rate": 5.360591753944221e-06,
"loss": 0.2792,
"step": 243
},
{
"epoch": 1.0093071354705274,
"grad_norm": 0.47153183817863464,
"learning_rate": 5.3263010945083994e-06,
"loss": 0.2593,
"step": 244
},
{
"epoch": 1.0134436401240952,
"grad_norm": 0.5729573369026184,
"learning_rate": 5.291995015079969e-06,
"loss": 0.2884,
"step": 245
},
{
"epoch": 1.017580144777663,
"grad_norm": 0.5748021602630615,
"learning_rate": 5.257675136859415e-06,
"loss": 0.2852,
"step": 246
},
{
"epoch": 1.0217166494312306,
"grad_norm": 0.49926644563674927,
"learning_rate": 5.223343081699302e-06,
"loss": 0.2947,
"step": 247
},
{
"epoch": 1.0258531540847984,
"grad_norm": 0.5036705732345581,
"learning_rate": 5.189000472027645e-06,
"loss": 0.2747,
"step": 248
},
{
"epoch": 1.0299896587383661,
"grad_norm": 0.557823896408081,
"learning_rate": 5.1546489307712345e-06,
"loss": 0.2724,
"step": 249
},
{
"epoch": 1.0341261633919339,
"grad_norm": 0.49561646580696106,
"learning_rate": 5.1202900812789346e-06,
"loss": 0.263,
"step": 250
},
{
"epoch": 1.0382626680455016,
"grad_norm": 0.46465885639190674,
"learning_rate": 5.085925547244978e-06,
"loss": 0.263,
"step": 251
},
{
"epoch": 1.0423991726990693,
"grad_norm": 0.5004085302352905,
"learning_rate": 5.051556952632235e-06,
"loss": 0.2831,
"step": 252
},
{
"epoch": 1.046535677352637,
"grad_norm": 0.5784794688224792,
"learning_rate": 5.0171859215954575e-06,
"loss": 0.2835,
"step": 253
},
{
"epoch": 1.0506721820062048,
"grad_norm": 0.4798305332660675,
"learning_rate": 4.982814078404543e-06,
"loss": 0.2382,
"step": 254
},
{
"epoch": 1.0548086866597726,
"grad_norm": 0.47284895181655884,
"learning_rate": 4.948443047367767e-06,
"loss": 0.2491,
"step": 255
},
{
"epoch": 1.0589451913133403,
"grad_norm": 0.4997791051864624,
"learning_rate": 4.9140744527550225e-06,
"loss": 0.2484,
"step": 256
},
{
"epoch": 1.063081695966908,
"grad_norm": 0.4812958836555481,
"learning_rate": 4.879709918721067e-06,
"loss": 0.2674,
"step": 257
},
{
"epoch": 1.0672182006204758,
"grad_norm": 0.4800451099872589,
"learning_rate": 4.845351069228767e-06,
"loss": 0.2625,
"step": 258
},
{
"epoch": 1.0713547052740435,
"grad_norm": 0.5013061165809631,
"learning_rate": 4.8109995279723556e-06,
"loss": 0.2739,
"step": 259
},
{
"epoch": 1.0754912099276113,
"grad_norm": 0.5202277898788452,
"learning_rate": 4.776656918300699e-06,
"loss": 0.2857,
"step": 260
},
{
"epoch": 1.079627714581179,
"grad_norm": 0.46747156977653503,
"learning_rate": 4.742324863140587e-06,
"loss": 0.2902,
"step": 261
},
{
"epoch": 1.0837642192347468,
"grad_norm": 0.4724840223789215,
"learning_rate": 4.70800498492003e-06,
"loss": 0.2845,
"step": 262
},
{
"epoch": 1.0879007238883145,
"grad_norm": 0.5077059864997864,
"learning_rate": 4.673698905491602e-06,
"loss": 0.297,
"step": 263
},
{
"epoch": 1.092037228541882,
"grad_norm": 0.4432675540447235,
"learning_rate": 4.639408246055781e-06,
"loss": 0.2286,
"step": 264
},
{
"epoch": 1.0961737331954498,
"grad_norm": 0.4326833188533783,
"learning_rate": 4.605134627084345e-06,
"loss": 0.2418,
"step": 265
},
{
"epoch": 1.1003102378490175,
"grad_norm": 0.4976271092891693,
"learning_rate": 4.570879668243792e-06,
"loss": 0.2825,
"step": 266
},
{
"epoch": 1.1044467425025852,
"grad_norm": 0.4635002613067627,
"learning_rate": 4.536644988318802e-06,
"loss": 0.2503,
"step": 267
},
{
"epoch": 1.108583247156153,
"grad_norm": 0.4908175766468048,
"learning_rate": 4.502432205135731e-06,
"loss": 0.298,
"step": 268
},
{
"epoch": 1.1127197518097207,
"grad_norm": 0.4961640238761902,
"learning_rate": 4.468242935486164e-06,
"loss": 0.2696,
"step": 269
},
{
"epoch": 1.1168562564632885,
"grad_norm": 0.49413740634918213,
"learning_rate": 4.434078795050509e-06,
"loss": 0.2938,
"step": 270
},
{
"epoch": 1.1209927611168562,
"grad_norm": 0.48604297637939453,
"learning_rate": 4.3999413983216434e-06,
"loss": 0.2884,
"step": 271
},
{
"epoch": 1.125129265770424,
"grad_norm": 0.4502314329147339,
"learning_rate": 4.365832358528618e-06,
"loss": 0.2514,
"step": 272
},
{
"epoch": 1.1292657704239917,
"grad_norm": 0.46243977546691895,
"learning_rate": 4.331753287560423e-06,
"loss": 0.2473,
"step": 273
},
{
"epoch": 1.1334022750775594,
"grad_norm": 0.48582252860069275,
"learning_rate": 4.29770579588981e-06,
"loss": 0.2926,
"step": 274
},
{
"epoch": 1.1375387797311272,
"grad_norm": 0.4945797622203827,
"learning_rate": 4.263691492497197e-06,
"loss": 0.2803,
"step": 275
},
{
"epoch": 1.141675284384695,
"grad_norm": 0.5017898082733154,
"learning_rate": 4.229711984794614e-06,
"loss": 0.2695,
"step": 276
},
{
"epoch": 1.1458117890382626,
"grad_norm": 0.44951367378234863,
"learning_rate": 4.195768878549766e-06,
"loss": 0.2548,
"step": 277
},
{
"epoch": 1.1499482936918304,
"grad_norm": 0.4264715611934662,
"learning_rate": 4.161863777810128e-06,
"loss": 0.2304,
"step": 278
},
{
"epoch": 1.1540847983453981,
"grad_norm": 0.4864782392978668,
"learning_rate": 4.127998284827148e-06,
"loss": 0.2883,
"step": 279
},
{
"epoch": 1.1582213029989659,
"grad_norm": 0.48877304792404175,
"learning_rate": 4.094173999980544e-06,
"loss": 0.2696,
"step": 280
},
{
"epoch": 1.1623578076525336,
"grad_norm": 0.4845278859138489,
"learning_rate": 4.060392521702655e-06,
"loss": 0.2696,
"step": 281
},
{
"epoch": 1.1664943123061013,
"grad_norm": 0.4687557816505432,
"learning_rate": 4.026655446402912e-06,
"loss": 0.2242,
"step": 282
},
{
"epoch": 1.170630816959669,
"grad_norm": 0.4510751962661743,
"learning_rate": 3.9929643683923965e-06,
"loss": 0.2534,
"step": 283
},
{
"epoch": 1.1747673216132368,
"grad_norm": 0.456969678401947,
"learning_rate": 3.9593208798085094e-06,
"loss": 0.239,
"step": 284
},
{
"epoch": 1.1789038262668046,
"grad_norm": 0.5285021066665649,
"learning_rate": 3.9257265705397065e-06,
"loss": 0.2706,
"step": 285
},
{
"epoch": 1.1830403309203723,
"grad_norm": 0.5108174085617065,
"learning_rate": 3.892183028150384e-06,
"loss": 0.292,
"step": 286
},
{
"epoch": 1.18717683557394,
"grad_norm": 0.4737439751625061,
"learning_rate": 3.8586918378058595e-06,
"loss": 0.2666,
"step": 287
},
{
"epoch": 1.1913133402275078,
"grad_norm": 0.46854445338249207,
"learning_rate": 3.8252545821974385e-06,
"loss": 0.2473,
"step": 288
},
{
"epoch": 1.1954498448810755,
"grad_norm": 0.5152525305747986,
"learning_rate": 3.791872841467643e-06,
"loss": 0.2787,
"step": 289
},
{
"epoch": 1.1995863495346433,
"grad_norm": 0.4602268636226654,
"learning_rate": 3.758548193135536e-06,
"loss": 0.2447,
"step": 290
},
{
"epoch": 1.203722854188211,
"grad_norm": 0.4676779806613922,
"learning_rate": 3.7252822120221592e-06,
"loss": 0.2715,
"step": 291
},
{
"epoch": 1.2078593588417788,
"grad_norm": 0.48289844393730164,
"learning_rate": 3.6920764701761263e-06,
"loss": 0.283,
"step": 292
},
{
"epoch": 1.2119958634953465,
"grad_norm": 0.4726490080356598,
"learning_rate": 3.6589325367993243e-06,
"loss": 0.2807,
"step": 293
},
{
"epoch": 1.2161323681489142,
"grad_norm": 0.5170783996582031,
"learning_rate": 3.625851978172765e-06,
"loss": 0.2636,
"step": 294
},
{
"epoch": 1.220268872802482,
"grad_norm": 0.46776092052459717,
"learning_rate": 3.59283635758256e-06,
"loss": 0.2457,
"step": 295
},
{
"epoch": 1.2244053774560497,
"grad_norm": 0.45310357213020325,
"learning_rate": 3.5598872352460457e-06,
"loss": 0.2538,
"step": 296
},
{
"epoch": 1.2285418821096175,
"grad_norm": 0.4700476825237274,
"learning_rate": 3.527006168238061e-06,
"loss": 0.2722,
"step": 297
},
{
"epoch": 1.2326783867631852,
"grad_norm": 0.4869045913219452,
"learning_rate": 3.4941947104173514e-06,
"loss": 0.2695,
"step": 298
},
{
"epoch": 1.236814891416753,
"grad_norm": 0.4840319752693176,
"learning_rate": 3.4614544123531476e-06,
"loss": 0.2671,
"step": 299
},
{
"epoch": 1.2409513960703205,
"grad_norm": 0.4536275565624237,
"learning_rate": 3.428786821251888e-06,
"loss": 0.2512,
"step": 300
},
{
"epoch": 1.2450879007238882,
"grad_norm": 0.4808495342731476,
"learning_rate": 3.3961934808841023e-06,
"loss": 0.2531,
"step": 301
},
{
"epoch": 1.249224405377456,
"grad_norm": 0.48514485359191895,
"learning_rate": 3.363675931511455e-06,
"loss": 0.2695,
"step": 302
},
{
"epoch": 1.2533609100310237,
"grad_norm": 0.48374173045158386,
"learning_rate": 3.331235709813962e-06,
"loss": 0.2706,
"step": 303
},
{
"epoch": 1.2574974146845914,
"grad_norm": 0.4591769278049469,
"learning_rate": 3.29887434881737e-06,
"loss": 0.2578,
"step": 304
},
{
"epoch": 1.2616339193381592,
"grad_norm": 0.4645506739616394,
"learning_rate": 3.2665933778207082e-06,
"loss": 0.2717,
"step": 305
},
{
"epoch": 1.265770423991727,
"grad_norm": 0.5053009986877441,
"learning_rate": 3.234394322324019e-06,
"loss": 0.2713,
"step": 306
},
{
"epoch": 1.2699069286452946,
"grad_norm": 0.46575117111206055,
"learning_rate": 3.2022787039562745e-06,
"loss": 0.2445,
"step": 307
},
{
"epoch": 1.2740434332988624,
"grad_norm": 0.4733026623725891,
"learning_rate": 3.170248040403457e-06,
"loss": 0.2602,
"step": 308
},
{
"epoch": 1.2781799379524301,
"grad_norm": 0.4547727406024933,
"learning_rate": 3.138303845336844e-06,
"loss": 0.2545,
"step": 309
},
{
"epoch": 1.2823164426059979,
"grad_norm": 0.5043481588363647,
"learning_rate": 3.1064476283414818e-06,
"loss": 0.2848,
"step": 310
},
{
"epoch": 1.2864529472595656,
"grad_norm": 0.49556779861450195,
"learning_rate": 3.074680894844837e-06,
"loss": 0.2659,
"step": 311
},
{
"epoch": 1.2905894519131333,
"grad_norm": 0.4662742614746094,
"learning_rate": 3.04300514604566e-06,
"loss": 0.2696,
"step": 312
},
{
"epoch": 1.294725956566701,
"grad_norm": 0.46650293469429016,
"learning_rate": 3.011421878843044e-06,
"loss": 0.2573,
"step": 313
},
{
"epoch": 1.2988624612202688,
"grad_norm": 0.471865177154541,
"learning_rate": 2.9799325857656856e-06,
"loss": 0.2598,
"step": 314
},
{
"epoch": 1.3029989658738366,
"grad_norm": 0.49665728211402893,
"learning_rate": 2.948538754901349e-06,
"loss": 0.285,
"step": 315
},
{
"epoch": 1.3071354705274043,
"grad_norm": 0.47322675585746765,
"learning_rate": 2.917241869826545e-06,
"loss": 0.2523,
"step": 316
},
{
"epoch": 1.311271975180972,
"grad_norm": 0.4811559021472931,
"learning_rate": 2.8860434095364266e-06,
"loss": 0.2762,
"step": 317
},
{
"epoch": 1.3154084798345398,
"grad_norm": 0.48528894782066345,
"learning_rate": 2.8549448483748888e-06,
"loss": 0.2812,
"step": 318
},
{
"epoch": 1.3195449844881075,
"grad_norm": 0.47023531794548035,
"learning_rate": 2.8239476559649013e-06,
"loss": 0.2857,
"step": 319
},
{
"epoch": 1.3236814891416753,
"grad_norm": 0.4679359793663025,
"learning_rate": 2.7930532971390543e-06,
"loss": 0.2639,
"step": 320
},
{
"epoch": 1.327817993795243,
"grad_norm": 0.4885619580745697,
"learning_rate": 2.762263231870339e-06,
"loss": 0.2919,
"step": 321
},
{
"epoch": 1.3319544984488108,
"grad_norm": 0.4528388977050781,
"learning_rate": 2.7315789152031504e-06,
"loss": 0.2491,
"step": 322
},
{
"epoch": 1.3360910031023785,
"grad_norm": 0.43351301550865173,
"learning_rate": 2.7010017971845267e-06,
"loss": 0.2334,
"step": 323
},
{
"epoch": 1.3402275077559462,
"grad_norm": 0.4791943430900574,
"learning_rate": 2.6705333227956304e-06,
"loss": 0.2759,
"step": 324
},
{
"epoch": 1.344364012409514,
"grad_norm": 0.42758721113204956,
"learning_rate": 2.6401749318834528e-06,
"loss": 0.2574,
"step": 325
},
{
"epoch": 1.3485005170630817,
"grad_norm": 0.4758831858634949,
"learning_rate": 2.609928059092779e-06,
"loss": 0.2459,
"step": 326
},
{
"epoch": 1.3526370217166495,
"grad_norm": 0.45820891857147217,
"learning_rate": 2.579794133798388e-06,
"loss": 0.2678,
"step": 327
},
{
"epoch": 1.3567735263702172,
"grad_norm": 0.4830312132835388,
"learning_rate": 2.549774580037504e-06,
"loss": 0.2627,
"step": 328
},
{
"epoch": 1.360910031023785,
"grad_norm": 0.48166197538375854,
"learning_rate": 2.5198708164425046e-06,
"loss": 0.2524,
"step": 329
},
{
"epoch": 1.3650465356773527,
"grad_norm": 0.48581820726394653,
"learning_rate": 2.4900842561738736e-06,
"loss": 0.2527,
"step": 330
},
{
"epoch": 1.3691830403309204,
"grad_norm": 0.49055343866348267,
"learning_rate": 2.4604163068534313e-06,
"loss": 0.2541,
"step": 331
},
{
"epoch": 1.3733195449844882,
"grad_norm": 0.4789126217365265,
"learning_rate": 2.4308683704978e-06,
"loss": 0.2597,
"step": 332
},
{
"epoch": 1.377456049638056,
"grad_norm": 0.5069748759269714,
"learning_rate": 2.401441843452159e-06,
"loss": 0.2842,
"step": 333
},
{
"epoch": 1.3815925542916236,
"grad_norm": 0.49611082673072815,
"learning_rate": 2.372138116324254e-06,
"loss": 0.2648,
"step": 334
},
{
"epoch": 1.3857290589451914,
"grad_norm": 0.4773513376712799,
"learning_rate": 2.342958573918682e-06,
"loss": 0.2846,
"step": 335
},
{
"epoch": 1.3898655635987591,
"grad_norm": 0.49834293127059937,
"learning_rate": 2.3139045951714473e-06,
"loss": 0.288,
"step": 336
},
{
"epoch": 1.3940020682523269,
"grad_norm": 0.46547529101371765,
"learning_rate": 2.2849775530848057e-06,
"loss": 0.242,
"step": 337
},
{
"epoch": 1.3981385729058946,
"grad_norm": 0.46302109956741333,
"learning_rate": 2.256178814662368e-06,
"loss": 0.2553,
"step": 338
},
{
"epoch": 1.4022750775594623,
"grad_norm": 0.5011090636253357,
"learning_rate": 2.227509740844508e-06,
"loss": 0.281,
"step": 339
},
{
"epoch": 1.40641158221303,
"grad_norm": 0.43727773427963257,
"learning_rate": 2.198971686444047e-06,
"loss": 0.2409,
"step": 340
},
{
"epoch": 1.4105480868665978,
"grad_norm": 0.4928194582462311,
"learning_rate": 2.1705660000822286e-06,
"loss": 0.299,
"step": 341
},
{
"epoch": 1.4146845915201656,
"grad_norm": 0.4691973328590393,
"learning_rate": 2.1422940241249875e-06,
"loss": 0.2552,
"step": 342
},
{
"epoch": 1.4188210961737333,
"grad_norm": 0.4593028724193573,
"learning_rate": 2.1141570946195106e-06,
"loss": 0.255,
"step": 343
},
{
"epoch": 1.422957600827301,
"grad_norm": 0.4884447753429413,
"learning_rate": 2.086156541231109e-06,
"loss": 0.2601,
"step": 344
},
{
"epoch": 1.4270941054808688,
"grad_norm": 0.5032863616943359,
"learning_rate": 2.0582936871803692e-06,
"loss": 0.2888,
"step": 345
},
{
"epoch": 1.4312306101344365,
"grad_norm": 0.46444255113601685,
"learning_rate": 2.0305698491806297e-06,
"loss": 0.2402,
"step": 346
},
{
"epoch": 1.4353671147880043,
"grad_norm": 0.4745306074619293,
"learning_rate": 2.0029863373757553e-06,
"loss": 0.2665,
"step": 347
},
{
"epoch": 1.4395036194415718,
"grad_norm": 0.4591853618621826,
"learning_rate": 1.9755444552782228e-06,
"loss": 0.2209,
"step": 348
},
{
"epoch": 1.4436401240951395,
"grad_norm": 0.4660813808441162,
"learning_rate": 1.948245499707523e-06,
"loss": 0.2559,
"step": 349
},
{
"epoch": 1.4477766287487073,
"grad_norm": 0.4885343909263611,
"learning_rate": 1.9210907607288728e-06,
"loss": 0.281,
"step": 350
},
{
"epoch": 1.451913133402275,
"grad_norm": 0.4733608365058899,
"learning_rate": 1.8940815215922609e-06,
"loss": 0.2762,
"step": 351
},
{
"epoch": 1.4560496380558428,
"grad_norm": 0.46303790807724,
"learning_rate": 1.867219058671791e-06,
"loss": 0.2626,
"step": 352
},
{
"epoch": 1.4601861427094105,
"grad_norm": 0.4606141149997711,
"learning_rate": 1.8405046414053728e-06,
"loss": 0.2434,
"step": 353
},
{
"epoch": 1.4643226473629782,
"grad_norm": 0.4833987057209015,
"learning_rate": 1.8139395322347335e-06,
"loss": 0.2546,
"step": 354
},
{
"epoch": 1.468459152016546,
"grad_norm": 0.44073909521102905,
"learning_rate": 1.787524986545753e-06,
"loss": 0.2511,
"step": 355
},
{
"epoch": 1.4725956566701137,
"grad_norm": 0.45749855041503906,
"learning_rate": 1.7612622526091406e-06,
"loss": 0.2391,
"step": 356
},
{
"epoch": 1.4767321613236815,
"grad_norm": 0.4941197633743286,
"learning_rate": 1.7351525715214512e-06,
"loss": 0.2607,
"step": 357
},
{
"epoch": 1.4808686659772492,
"grad_norm": 0.4428948760032654,
"learning_rate": 1.709197177146425e-06,
"loss": 0.2477,
"step": 358
},
{
"epoch": 1.485005170630817,
"grad_norm": 0.5241663455963135,
"learning_rate": 1.6833972960566868e-06,
"loss": 0.258,
"step": 359
},
{
"epoch": 1.4891416752843847,
"grad_norm": 0.48429349064826965,
"learning_rate": 1.6577541474757712e-06,
"loss": 0.2709,
"step": 360
},
{
"epoch": 1.4932781799379524,
"grad_norm": 0.49429482221603394,
"learning_rate": 1.6322689432205252e-06,
"loss": 0.2787,
"step": 361
},
{
"epoch": 1.4974146845915202,
"grad_norm": 0.487251341342926,
"learning_rate": 1.6069428876438203e-06,
"loss": 0.2612,
"step": 362
},
{
"epoch": 1.501551189245088,
"grad_norm": 0.4490973949432373,
"learning_rate": 1.5817771775776508e-06,
"loss": 0.2516,
"step": 363
},
{
"epoch": 1.5056876938986556,
"grad_norm": 0.48428958654403687,
"learning_rate": 1.5567730022765753e-06,
"loss": 0.2773,
"step": 364
},
{
"epoch": 1.5098241985522234,
"grad_norm": 0.4671989381313324,
"learning_rate": 1.5319315433615101e-06,
"loss": 0.267,
"step": 365
},
{
"epoch": 1.5139607032057911,
"grad_norm": 0.5202347636222839,
"learning_rate": 1.5072539747638887e-06,
"loss": 0.294,
"step": 366
},
{
"epoch": 1.5180972078593589,
"grad_norm": 0.45074182748794556,
"learning_rate": 1.482741462670193e-06,
"loss": 0.2363,
"step": 367
},
{
"epoch": 1.5222337125129266,
"grad_norm": 0.5174707174301147,
"learning_rate": 1.4583951654668416e-06,
"loss": 0.2767,
"step": 368
},
{
"epoch": 1.5263702171664943,
"grad_norm": 0.4842411279678345,
"learning_rate": 1.434216233685441e-06,
"loss": 0.2858,
"step": 369
},
{
"epoch": 1.530506721820062,
"grad_norm": 0.4314868450164795,
"learning_rate": 1.4102058099484188e-06,
"loss": 0.2356,
"step": 370
},
{
"epoch": 1.5346432264736298,
"grad_norm": 0.47764474153518677,
"learning_rate": 1.3863650289150338e-06,
"loss": 0.2632,
"step": 371
},
{
"epoch": 1.5387797311271976,
"grad_norm": 0.4491686522960663,
"learning_rate": 1.3626950172277398e-06,
"loss": 0.2443,
"step": 372
},
{
"epoch": 1.542916235780765,
"grad_norm": 0.4948718249797821,
"learning_rate": 1.3391968934589573e-06,
"loss": 0.2772,
"step": 373
},
{
"epoch": 1.5470527404343328,
"grad_norm": 0.4696827828884125,
"learning_rate": 1.3158717680582128e-06,
"loss": 0.2568,
"step": 374
},
{
"epoch": 1.5511892450879006,
"grad_norm": 0.43921521306037903,
"learning_rate": 1.292720743299654e-06,
"loss": 0.229,
"step": 375
},
{
"epoch": 1.5553257497414683,
"grad_norm": 0.4529230296611786,
"learning_rate": 1.2697449132299649e-06,
"loss": 0.2445,
"step": 376
},
{
"epoch": 1.559462254395036,
"grad_norm": 0.4936879873275757,
"learning_rate": 1.2469453636166645e-06,
"loss": 0.2579,
"step": 377
},
{
"epoch": 1.5635987590486038,
"grad_norm": 0.48198625445365906,
"learning_rate": 1.224323171896797e-06,
"loss": 0.2542,
"step": 378
},
{
"epoch": 1.5677352637021715,
"grad_norm": 0.4886031448841095,
"learning_rate": 1.201879407126012e-06,
"loss": 0.2707,
"step": 379
},
{
"epoch": 1.5718717683557393,
"grad_norm": 0.47718653082847595,
"learning_rate": 1.1796151299280483e-06,
"loss": 0.2747,
"step": 380
},
{
"epoch": 1.576008273009307,
"grad_norm": 0.46224093437194824,
"learning_rate": 1.1575313924446123e-06,
"loss": 0.247,
"step": 381
},
{
"epoch": 1.5801447776628748,
"grad_norm": 0.4610908329486847,
"learning_rate": 1.1356292382856531e-06,
"loss": 0.2624,
"step": 382
},
{
"epoch": 1.5842812823164425,
"grad_norm": 0.46388930082321167,
"learning_rate": 1.113909702480046e-06,
"loss": 0.2485,
"step": 383
},
{
"epoch": 1.5884177869700102,
"grad_norm": 0.4562687575817108,
"learning_rate": 1.0923738114266824e-06,
"loss": 0.2503,
"step": 384
},
{
"epoch": 1.592554291623578,
"grad_norm": 0.44876885414123535,
"learning_rate": 1.0710225828459642e-06,
"loss": 0.2453,
"step": 385
},
{
"epoch": 1.5966907962771457,
"grad_norm": 0.45502784848213196,
"learning_rate": 1.0498570257317075e-06,
"loss": 0.2595,
"step": 386
},
{
"epoch": 1.6008273009307135,
"grad_norm": 0.47724854946136475,
"learning_rate": 1.028878140303462e-06,
"loss": 0.2541,
"step": 387
},
{
"epoch": 1.6049638055842812,
"grad_norm": 0.45897573232650757,
"learning_rate": 1.008086917959249e-06,
"loss": 0.2628,
"step": 388
},
{
"epoch": 1.609100310237849,
"grad_norm": 0.4865526258945465,
"learning_rate": 9.874843412286994e-07,
"loss": 0.2693,
"step": 389
},
{
"epoch": 1.6132368148914167,
"grad_norm": 0.46964144706726074,
"learning_rate": 9.670713837266322e-07,
"loss": 0.2498,
"step": 390
},
{
"epoch": 1.6173733195449844,
"grad_norm": 0.42305079102516174,
"learning_rate": 9.46849010107041e-07,
"loss": 0.2262,
"step": 391
},
{
"epoch": 1.6215098241985522,
"grad_norm": 0.4819132089614868,
"learning_rate": 9.26818176017506e-07,
"loss": 0.2617,
"step": 392
},
{
"epoch": 1.62564632885212,
"grad_norm": 0.4843488037586212,
"learning_rate": 9.069798280540348e-07,
"loss": 0.2636,
"step": 393
},
{
"epoch": 1.6297828335056876,
"grad_norm": 0.4789119064807892,
"learning_rate": 8.87334903716332e-07,
"loss": 0.2869,
"step": 394
},
{
"epoch": 1.6339193381592554,
"grad_norm": 0.42331403493881226,
"learning_rate": 8.678843313634894e-07,
"loss": 0.2192,
"step": 395
},
{
"epoch": 1.6380558428128231,
"grad_norm": 0.45914411544799805,
"learning_rate": 8.486290301701183e-07,
"loss": 0.2654,
"step": 396
},
{
"epoch": 1.6421923474663909,
"grad_norm": 0.4775830805301666,
"learning_rate": 8.295699100829124e-07,
"loss": 0.2434,
"step": 397
},
{
"epoch": 1.6463288521199586,
"grad_norm": 0.5007808804512024,
"learning_rate": 8.107078717776457e-07,
"loss": 0.2697,
"step": 398
},
{
"epoch": 1.6504653567735263,
"grad_norm": 0.4754742681980133,
"learning_rate": 7.920438066166097e-07,
"loss": 0.2626,
"step": 399
},
{
"epoch": 1.654601861427094,
"grad_norm": 0.46346259117126465,
"learning_rate": 7.735785966064885e-07,
"loss": 0.2268,
"step": 400
},
{
"epoch": 1.6587383660806618,
"grad_norm": 0.4413525462150574,
"learning_rate": 7.553131143566822e-07,
"loss": 0.2373,
"step": 401
},
{
"epoch": 1.6628748707342296,
"grad_norm": 0.447625994682312,
"learning_rate": 7.372482230380657e-07,
"loss": 0.2546,
"step": 402
},
{
"epoch": 1.6670113753877973,
"grad_norm": 0.4605792462825775,
"learning_rate": 7.193847763421991e-07,
"loss": 0.2656,
"step": 403
},
{
"epoch": 1.671147880041365,
"grad_norm": 0.4576088786125183,
"learning_rate": 7.017236184409859e-07,
"loss": 0.2576,
"step": 404
},
{
"epoch": 1.6752843846949328,
"grad_norm": 0.5075780153274536,
"learning_rate": 6.842655839467787e-07,
"loss": 0.3023,
"step": 405
},
{
"epoch": 1.6794208893485005,
"grad_norm": 0.4650248885154724,
"learning_rate": 6.670114978729392e-07,
"loss": 0.2753,
"step": 406
},
{
"epoch": 1.6835573940020683,
"grad_norm": 0.4480326175689697,
"learning_rate": 6.499621755948487e-07,
"loss": 0.2448,
"step": 407
},
{
"epoch": 1.687693898655636,
"grad_norm": 0.48435285687446594,
"learning_rate": 6.331184228113801e-07,
"loss": 0.2729,
"step": 408
},
{
"epoch": 1.6918304033092038,
"grad_norm": 0.4679297208786011,
"learning_rate": 6.164810355068179e-07,
"loss": 0.2394,
"step": 409
},
{
"epoch": 1.6959669079627715,
"grad_norm": 0.5232973694801331,
"learning_rate": 6.000507999132444e-07,
"loss": 0.2761,
"step": 410
},
{
"epoch": 1.7001034126163392,
"grad_norm": 0.43717169761657715,
"learning_rate": 5.838284924733866e-07,
"loss": 0.2476,
"step": 411
},
{
"epoch": 1.704239917269907,
"grad_norm": 0.4989730417728424,
"learning_rate": 5.678148798039213e-07,
"loss": 0.2723,
"step": 412
},
{
"epoch": 1.7083764219234747,
"grad_norm": 0.4776909649372101,
"learning_rate": 5.520107186592477e-07,
"loss": 0.2394,
"step": 413
},
{
"epoch": 1.7125129265770425,
"grad_norm": 0.49704718589782715,
"learning_rate": 5.364167558957267e-07,
"loss": 0.2674,
"step": 414
},
{
"epoch": 1.7166494312306102,
"grad_norm": 0.5080196857452393,
"learning_rate": 5.210337284363876e-07,
"loss": 0.2846,
"step": 415
},
{
"epoch": 1.720785935884178,
"grad_norm": 0.5011091828346252,
"learning_rate": 5.058623632361004e-07,
"loss": 0.276,
"step": 416
},
{
"epoch": 1.7249224405377457,
"grad_norm": 0.4899991750717163,
"learning_rate": 4.909033772472204e-07,
"loss": 0.2465,
"step": 417
},
{
"epoch": 1.7290589451913134,
"grad_norm": 0.47677579522132874,
"learning_rate": 4.7615747738571636e-07,
"loss": 0.2547,
"step": 418
},
{
"epoch": 1.7331954498448812,
"grad_norm": 0.4679698050022125,
"learning_rate": 4.6162536049775387e-07,
"loss": 0.2687,
"step": 419
},
{
"epoch": 1.737331954498449,
"grad_norm": 0.4611322283744812,
"learning_rate": 4.473077133267684e-07,
"loss": 0.2517,
"step": 420
},
{
"epoch": 1.7414684591520166,
"grad_norm": 0.45688915252685547,
"learning_rate": 4.3320521248101487e-07,
"loss": 0.2449,
"step": 421
},
{
"epoch": 1.7456049638055844,
"grad_norm": 0.44202756881713867,
"learning_rate": 4.193185244015879e-07,
"loss": 0.2274,
"step": 422
},
{
"epoch": 1.7497414684591521,
"grad_norm": 0.488298237323761,
"learning_rate": 4.0564830533093014e-07,
"loss": 0.2706,
"step": 423
},
{
"epoch": 1.7538779731127199,
"grad_norm": 0.44502395391464233,
"learning_rate": 3.9219520128182087e-07,
"loss": 0.2343,
"step": 424
},
{
"epoch": 1.7580144777662876,
"grad_norm": 0.4559187889099121,
"learning_rate": 3.789598480068479e-07,
"loss": 0.2477,
"step": 425
},
{
"epoch": 1.7621509824198553,
"grad_norm": 0.43528175354003906,
"learning_rate": 3.659428709683621e-07,
"loss": 0.2279,
"step": 426
},
{
"epoch": 1.766287487073423,
"grad_norm": 0.47880756855010986,
"learning_rate": 3.531448853089192e-07,
"loss": 0.2631,
"step": 427
},
{
"epoch": 1.7704239917269908,
"grad_norm": 0.49789199233055115,
"learning_rate": 3.40566495822216e-07,
"loss": 0.2925,
"step": 428
},
{
"epoch": 1.7745604963805586,
"grad_norm": 0.4378401041030884,
"learning_rate": 3.2820829692449984e-07,
"loss": 0.227,
"step": 429
},
{
"epoch": 1.7786970010341263,
"grad_norm": 0.4724928140640259,
"learning_rate": 3.160708726264855e-07,
"loss": 0.2657,
"step": 430
},
{
"epoch": 1.782833505687694,
"grad_norm": 0.43662911653518677,
"learning_rate": 3.0415479650575783e-07,
"loss": 0.2399,
"step": 431
},
{
"epoch": 1.7869700103412618,
"grad_norm": 0.46386146545410156,
"learning_rate": 2.9246063167965963e-07,
"loss": 0.2447,
"step": 432
},
{
"epoch": 1.7911065149948295,
"grad_norm": 0.47366079688072205,
"learning_rate": 2.809889307786856e-07,
"loss": 0.2449,
"step": 433
},
{
"epoch": 1.795243019648397,
"grad_norm": 0.4846685826778412,
"learning_rate": 2.697402359203638e-07,
"loss": 0.2559,
"step": 434
},
{
"epoch": 1.7993795243019648,
"grad_norm": 0.4788161516189575,
"learning_rate": 2.587150786836407e-07,
"loss": 0.2749,
"step": 435
},
{
"epoch": 1.8035160289555325,
"grad_norm": 0.49820560216903687,
"learning_rate": 2.4791398008375545e-07,
"loss": 0.2748,
"step": 436
},
{
"epoch": 1.8076525336091003,
"grad_norm": 0.45833131670951843,
"learning_rate": 2.3733745054762059e-07,
"loss": 0.2293,
"step": 437
},
{
"epoch": 1.811789038262668,
"grad_norm": 0.5000050067901611,
"learning_rate": 2.2698598988970422e-07,
"loss": 0.2634,
"step": 438
},
{
"epoch": 1.8159255429162358,
"grad_norm": 0.45837461948394775,
"learning_rate": 2.1686008728840301e-07,
"loss": 0.2525,
"step": 439
},
{
"epoch": 1.8200620475698035,
"grad_norm": 0.4396543800830841,
"learning_rate": 2.0696022126293126e-07,
"loss": 0.2374,
"step": 440
},
{
"epoch": 1.8241985522233712,
"grad_norm": 0.4914761483669281,
"learning_rate": 1.9728685965070604e-07,
"loss": 0.2992,
"step": 441
},
{
"epoch": 1.828335056876939,
"grad_norm": 0.5126286745071411,
"learning_rate": 1.8784045958523623e-07,
"loss": 0.2795,
"step": 442
},
{
"epoch": 1.8324715615305067,
"grad_norm": 0.44213420152664185,
"learning_rate": 1.786214674745218e-07,
"loss": 0.2247,
"step": 443
},
{
"epoch": 1.8366080661840745,
"grad_norm": 0.4569559693336487,
"learning_rate": 1.6963031897995863e-07,
"loss": 0.2451,
"step": 444
},
{
"epoch": 1.8407445708376422,
"grad_norm": 0.4845653474330902,
"learning_rate": 1.6086743899575042e-07,
"loss": 0.2818,
"step": 445
},
{
"epoch": 1.84488107549121,
"grad_norm": 0.4564604163169861,
"learning_rate": 1.523332416288259e-07,
"loss": 0.2539,
"step": 446
},
{
"epoch": 1.8490175801447777,
"grad_norm": 0.4548117518424988,
"learning_rate": 1.4402813017927396e-07,
"loss": 0.2554,
"step": 447
},
{
"epoch": 1.8531540847983454,
"grad_norm": 0.4759480655193329,
"learning_rate": 1.3595249712128334e-07,
"loss": 0.2661,
"step": 448
},
{
"epoch": 1.8572905894519132,
"grad_norm": 0.46541112661361694,
"learning_rate": 1.28106724084594e-07,
"loss": 0.2486,
"step": 449
},
{
"epoch": 1.861427094105481,
"grad_norm": 0.4635773003101349,
"learning_rate": 1.2049118183646403e-07,
"loss": 0.2653,
"step": 450
},
{
"epoch": 1.8655635987590486,
"grad_norm": 0.44061151146888733,
"learning_rate": 1.1310623026414891e-07,
"loss": 0.2255,
"step": 451
},
{
"epoch": 1.8697001034126164,
"grad_norm": 0.45572927594184875,
"learning_rate": 1.059522183578926e-07,
"loss": 0.2533,
"step": 452
},
{
"epoch": 1.8738366080661841,
"grad_norm": 0.4822574853897095,
"learning_rate": 9.902948419443669e-08,
"loss": 0.2767,
"step": 453
},
{
"epoch": 1.8779731127197516,
"grad_norm": 0.4398654103279114,
"learning_rate": 9.233835492104326e-08,
"loss": 0.2492,
"step": 454
},
{
"epoch": 1.8821096173733194,
"grad_norm": 0.4548628032207489,
"learning_rate": 8.587914674003384e-08,
"loss": 0.254,
"step": 455
},
{
"epoch": 1.8862461220268871,
"grad_norm": 0.45040181279182434,
"learning_rate": 7.965216489384919e-08,
"loss": 0.2721,
"step": 456
},
{
"epoch": 1.8903826266804549,
"grad_norm": 0.47080284357070923,
"learning_rate": 7.365770365062308e-08,
"loss": 0.2718,
"step": 457
},
{
"epoch": 1.8945191313340226,
"grad_norm": 0.48404160141944885,
"learning_rate": 6.789604629027614e-08,
"loss": 0.2924,
"step": 458
},
{
"epoch": 1.8986556359875904,
"grad_norm": 0.46306654810905457,
"learning_rate": 6.236746509112824e-08,
"loss": 0.2531,
"step": 459
},
{
"epoch": 1.902792140641158,
"grad_norm": 0.4330954849720001,
"learning_rate": 5.707222131703216e-08,
"loss": 0.2388,
"step": 460
},
{
"epoch": 1.9069286452947258,
"grad_norm": 0.46021175384521484,
"learning_rate": 5.201056520502734e-08,
"loss": 0.2468,
"step": 461
},
{
"epoch": 1.9110651499482936,
"grad_norm": 0.5022516250610352,
"learning_rate": 4.718273595351486e-08,
"loss": 0.263,
"step": 462
},
{
"epoch": 1.9152016546018613,
"grad_norm": 0.47739377617836,
"learning_rate": 4.25889617109515e-08,
"loss": 0.2718,
"step": 463
},
{
"epoch": 1.919338159255429,
"grad_norm": 0.4588397741317749,
"learning_rate": 3.8229459565070074e-08,
"loss": 0.2412,
"step": 464
},
{
"epoch": 1.9234746639089968,
"grad_norm": 0.4719136953353882,
"learning_rate": 3.410443553262033e-08,
"loss": 0.2722,
"step": 465
},
{
"epoch": 1.9276111685625645,
"grad_norm": 0.4567975401878357,
"learning_rate": 3.0214084549632925e-08,
"loss": 0.2536,
"step": 466
},
{
"epoch": 1.9317476732161323,
"grad_norm": 0.4981779158115387,
"learning_rate": 2.6558590462207322e-08,
"loss": 0.27,
"step": 467
},
{
"epoch": 1.9358841778697,
"grad_norm": 0.4779011309146881,
"learning_rate": 2.3138126017822614e-08,
"loss": 0.2707,
"step": 468
},
{
"epoch": 1.9400206825232678,
"grad_norm": 0.4619957506656647,
"learning_rate": 1.99528528571763e-08,
"loss": 0.2516,
"step": 469
},
{
"epoch": 1.9441571871768355,
"grad_norm": 0.47019270062446594,
"learning_rate": 1.7002921506544812e-08,
"loss": 0.2762,
"step": 470
},
{
"epoch": 1.9482936918304032,
"grad_norm": 0.48734498023986816,
"learning_rate": 1.4288471370669244e-08,
"loss": 0.2779,
"step": 471
},
{
"epoch": 1.952430196483971,
"grad_norm": 0.5020056366920471,
"learning_rate": 1.1809630726167808e-08,
"loss": 0.2731,
"step": 472
},
{
"epoch": 1.9565667011375387,
"grad_norm": 0.4687701165676117,
"learning_rate": 9.566516715474594e-09,
"loss": 0.2584,
"step": 473
},
{
"epoch": 1.9607032057911065,
"grad_norm": 0.4735799729824066,
"learning_rate": 7.559235341302872e-09,
"loss": 0.2663,
"step": 474
},
{
"epoch": 1.9648397104446742,
"grad_norm": 0.4657973349094391,
"learning_rate": 5.787881461636891e-09,
"loss": 0.2597,
"step": 475
},
{
"epoch": 1.968976215098242,
"grad_norm": 0.43754643201828003,
"learning_rate": 4.252538785248228e-09,
"loss": 0.2198,
"step": 476
},
{
"epoch": 1.9731127197518097,
"grad_norm": 0.45479777455329895,
"learning_rate": 2.9532798677395226e-09,
"loss": 0.2456,
"step": 477
},
{
"epoch": 1.9772492244053774,
"grad_norm": 0.4745938181877136,
"learning_rate": 1.8901661081172084e-09,
"loss": 0.2719,
"step": 478
},
{
"epoch": 1.9813857290589452,
"grad_norm": 0.4496646225452423,
"learning_rate": 1.0632477458888401e-09,
"loss": 0.2545,
"step": 479
},
{
"epoch": 1.985522233712513,
"grad_norm": 0.5044782757759094,
"learning_rate": 4.725638586894344e-10,
"loss": 0.2904,
"step": 480
},
{
"epoch": 1.9896587383660806,
"grad_norm": 0.45781707763671875,
"learning_rate": 1.1814236043405924e-10,
"loss": 0.2429,
"step": 481
},
{
"epoch": 1.9937952430196484,
"grad_norm": 0.4693934917449951,
"learning_rate": 0.0,
"loss": 0.2602,
"step": 482
},
{
"epoch": 1.9937952430196484,
"step": 482,
"total_flos": 3.6089290785072087e+18,
"train_loss": 0.3119487636316861,
"train_runtime": 2571.753,
"train_samples_per_second": 24.051,
"train_steps_per_second": 0.187
}
],
"logging_steps": 1,
"max_steps": 482,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.6089290785072087e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}