{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.981177899210686,
"eval_steps": 26,
"global_step": 822,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024286581663630845,
"grad_norm": 3.421875,
"learning_rate": 1.25e-06,
"loss": 0.9095,
"step": 1
},
{
"epoch": 0.0024286581663630845,
"eval_loss": 0.8089314699172974,
"eval_runtime": 98.8099,
"eval_samples_per_second": 30.361,
"eval_steps_per_second": 3.795,
"step": 1
},
{
"epoch": 0.004857316332726169,
"grad_norm": 3.40625,
"learning_rate": 2.5e-06,
"loss": 0.8146,
"step": 2
},
{
"epoch": 0.007285974499089253,
"grad_norm": 3.265625,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.806,
"step": 3
},
{
"epoch": 0.009714632665452338,
"grad_norm": 2.453125,
"learning_rate": 5e-06,
"loss": 0.781,
"step": 4
},
{
"epoch": 0.012143290831815421,
"grad_norm": 2.015625,
"learning_rate": 6.25e-06,
"loss": 0.7774,
"step": 5
},
{
"epoch": 0.014571948998178506,
"grad_norm": 1.953125,
"learning_rate": 7.500000000000001e-06,
"loss": 0.776,
"step": 6
},
{
"epoch": 0.01700060716454159,
"grad_norm": 2.03125,
"learning_rate": 8.750000000000001e-06,
"loss": 0.7554,
"step": 7
},
{
"epoch": 0.019429265330904676,
"grad_norm": 1.1640625,
"learning_rate": 1e-05,
"loss": 0.7362,
"step": 8
},
{
"epoch": 0.02185792349726776,
"grad_norm": 0.97265625,
"learning_rate": 1.125e-05,
"loss": 0.7365,
"step": 9
},
{
"epoch": 0.024286581663630843,
"grad_norm": 1.1640625,
"learning_rate": 1.25e-05,
"loss": 0.7183,
"step": 10
},
{
"epoch": 0.02671523982999393,
"grad_norm": 1.1953125,
"learning_rate": 1.375e-05,
"loss": 0.7153,
"step": 11
},
{
"epoch": 0.029143897996357013,
"grad_norm": 1.2109375,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.8111,
"step": 12
},
{
"epoch": 0.031572556162720096,
"grad_norm": 0.78125,
"learning_rate": 1.6250000000000002e-05,
"loss": 0.6966,
"step": 13
},
{
"epoch": 0.03400121432908318,
"grad_norm": 0.640625,
"learning_rate": 1.7500000000000002e-05,
"loss": 0.7068,
"step": 14
},
{
"epoch": 0.03642987249544627,
"grad_norm": 0.6484375,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.6915,
"step": 15
},
{
"epoch": 0.03885853066180935,
"grad_norm": 0.6796875,
"learning_rate": 2e-05,
"loss": 0.6878,
"step": 16
},
{
"epoch": 0.041287188828172436,
"grad_norm": 0.61328125,
"learning_rate": 1.999992403752328e-05,
"loss": 0.6902,
"step": 17
},
{
"epoch": 0.04371584699453552,
"grad_norm": 0.55859375,
"learning_rate": 1.999969615124717e-05,
"loss": 0.6818,
"step": 18
},
{
"epoch": 0.0461445051608986,
"grad_norm": 0.52734375,
"learning_rate": 1.999931634463383e-05,
"loss": 0.6732,
"step": 19
},
{
"epoch": 0.048573163327261686,
"grad_norm": 0.47265625,
"learning_rate": 1.9998784623453477e-05,
"loss": 0.6693,
"step": 20
},
{
"epoch": 0.051001821493624776,
"grad_norm": 0.47265625,
"learning_rate": 1.999810099578428e-05,
"loss": 0.6663,
"step": 21
},
{
"epoch": 0.05343047965998786,
"grad_norm": 0.72265625,
"learning_rate": 1.9997265472012247e-05,
"loss": 0.7473,
"step": 22
},
{
"epoch": 0.05585913782635094,
"grad_norm": 0.462890625,
"learning_rate": 1.999627806483107e-05,
"loss": 0.6457,
"step": 23
},
{
"epoch": 0.058287795992714025,
"grad_norm": 0.70703125,
"learning_rate": 1.999513878924193e-05,
"loss": 0.7388,
"step": 24
},
{
"epoch": 0.06071645415907711,
"grad_norm": 0.447265625,
"learning_rate": 1.9993847662553264e-05,
"loss": 0.6505,
"step": 25
},
{
"epoch": 0.06314511232544019,
"grad_norm": 0.4453125,
"learning_rate": 1.9992404704380513e-05,
"loss": 0.6388,
"step": 26
},
{
"epoch": 0.06314511232544019,
"eval_loss": 0.649046003818512,
"eval_runtime": 97.2348,
"eval_samples_per_second": 30.853,
"eval_steps_per_second": 3.857,
"step": 26
},
{
"epoch": 0.06557377049180328,
"grad_norm": 0.416015625,
"learning_rate": 1.9990809936645804e-05,
"loss": 0.6507,
"step": 27
},
{
"epoch": 0.06800242865816636,
"grad_norm": 0.42578125,
"learning_rate": 1.9989063383577644e-05,
"loss": 0.6536,
"step": 28
},
{
"epoch": 0.07043108682452945,
"grad_norm": 0.42578125,
"learning_rate": 1.998716507171053e-05,
"loss": 0.6508,
"step": 29
},
{
"epoch": 0.07285974499089254,
"grad_norm": 0.412109375,
"learning_rate": 1.9985115029884556e-05,
"loss": 0.6465,
"step": 30
},
{
"epoch": 0.07528840315725562,
"grad_norm": 0.5546875,
"learning_rate": 1.9982913289244977e-05,
"loss": 0.7309,
"step": 31
},
{
"epoch": 0.0777170613236187,
"grad_norm": 0.396484375,
"learning_rate": 1.9980559883241723e-05,
"loss": 0.6319,
"step": 32
},
{
"epoch": 0.08014571948998178,
"grad_norm": 0.392578125,
"learning_rate": 1.9978054847628908e-05,
"loss": 0.6309,
"step": 33
},
{
"epoch": 0.08257437765634487,
"grad_norm": 0.392578125,
"learning_rate": 1.9975398220464268e-05,
"loss": 0.6301,
"step": 34
},
{
"epoch": 0.08500303582270795,
"grad_norm": 0.392578125,
"learning_rate": 1.9972590042108605e-05,
"loss": 0.6364,
"step": 35
},
{
"epoch": 0.08743169398907104,
"grad_norm": 0.400390625,
"learning_rate": 1.996963035522515e-05,
"loss": 0.6303,
"step": 36
},
{
"epoch": 0.08986035215543413,
"grad_norm": 0.384765625,
"learning_rate": 1.9966519204778937e-05,
"loss": 0.6374,
"step": 37
},
{
"epoch": 0.0922890103217972,
"grad_norm": 0.390625,
"learning_rate": 1.99632566380361e-05,
"loss": 0.6177,
"step": 38
},
{
"epoch": 0.0947176684881603,
"grad_norm": 0.38671875,
"learning_rate": 1.995984270456317e-05,
"loss": 0.6259,
"step": 39
},
{
"epoch": 0.09714632665452337,
"grad_norm": 0.380859375,
"learning_rate": 1.995627745622632e-05,
"loss": 0.6311,
"step": 40
},
{
"epoch": 0.09957498482088646,
"grad_norm": 0.39453125,
"learning_rate": 1.9952560947190568e-05,
"loss": 0.6254,
"step": 41
},
{
"epoch": 0.10200364298724955,
"grad_norm": 0.376953125,
"learning_rate": 1.994869323391895e-05,
"loss": 0.6197,
"step": 42
},
{
"epoch": 0.10443230115361263,
"grad_norm": 0.373046875,
"learning_rate": 1.9944674375171697e-05,
"loss": 0.6147,
"step": 43
},
{
"epoch": 0.10686095931997572,
"grad_norm": 0.380859375,
"learning_rate": 1.9940504432005293e-05,
"loss": 0.6281,
"step": 44
},
{
"epoch": 0.1092896174863388,
"grad_norm": 0.36328125,
"learning_rate": 1.993618346777158e-05,
"loss": 0.6142,
"step": 45
},
{
"epoch": 0.11171827565270188,
"grad_norm": 0.373046875,
"learning_rate": 1.993171154811679e-05,
"loss": 0.6182,
"step": 46
},
{
"epoch": 0.11414693381906496,
"grad_norm": 0.376953125,
"learning_rate": 1.992708874098054e-05,
"loss": 0.6181,
"step": 47
},
{
"epoch": 0.11657559198542805,
"grad_norm": 0.375,
"learning_rate": 1.992231511659481e-05,
"loss": 0.6136,
"step": 48
},
{
"epoch": 0.11900425015179114,
"grad_norm": 0.376953125,
"learning_rate": 1.9917390747482855e-05,
"loss": 0.6052,
"step": 49
},
{
"epoch": 0.12143290831815422,
"grad_norm": 0.369140625,
"learning_rate": 1.9912315708458144e-05,
"loss": 0.6087,
"step": 50
},
{
"epoch": 0.12386156648451731,
"grad_norm": 0.359375,
"learning_rate": 1.9907090076623174e-05,
"loss": 0.6031,
"step": 51
},
{
"epoch": 0.12629022465088038,
"grad_norm": 0.384765625,
"learning_rate": 1.9901713931368333e-05,
"loss": 0.6131,
"step": 52
},
{
"epoch": 0.12629022465088038,
"eval_loss": 0.612246572971344,
"eval_runtime": 97.1281,
"eval_samples_per_second": 30.887,
"eval_steps_per_second": 3.861,
"step": 52
},
{
"epoch": 0.12871888281724347,
"grad_norm": 0.455078125,
"learning_rate": 1.989618735437069e-05,
"loss": 0.702,
"step": 53
},
{
"epoch": 0.13114754098360656,
"grad_norm": 0.380859375,
"learning_rate": 1.989051042959273e-05,
"loss": 0.6192,
"step": 54
},
{
"epoch": 0.13357619914996965,
"grad_norm": 0.388671875,
"learning_rate": 1.9884683243281117e-05,
"loss": 0.612,
"step": 55
},
{
"epoch": 0.13600485731633272,
"grad_norm": 0.376953125,
"learning_rate": 1.9878705883965342e-05,
"loss": 0.6026,
"step": 56
},
{
"epoch": 0.1384335154826958,
"grad_norm": 0.380859375,
"learning_rate": 1.9872578442456415e-05,
"loss": 0.6044,
"step": 57
},
{
"epoch": 0.1408621736490589,
"grad_norm": 0.404296875,
"learning_rate": 1.986630101184546e-05,
"loss": 0.6061,
"step": 58
},
{
"epoch": 0.143290831815422,
"grad_norm": 0.390625,
"learning_rate": 1.9859873687502317e-05,
"loss": 0.6113,
"step": 59
},
{
"epoch": 0.14571948998178508,
"grad_norm": 0.384765625,
"learning_rate": 1.9853296567074075e-05,
"loss": 0.5933,
"step": 60
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.38671875,
"learning_rate": 1.9846569750483605e-05,
"loss": 0.6046,
"step": 61
},
{
"epoch": 0.15057680631451123,
"grad_norm": 0.3984375,
"learning_rate": 1.983969333992804e-05,
"loss": 0.6079,
"step": 62
},
{
"epoch": 0.15300546448087432,
"grad_norm": 0.392578125,
"learning_rate": 1.9832667439877217e-05,
"loss": 0.6098,
"step": 63
},
{
"epoch": 0.1554341226472374,
"grad_norm": 0.373046875,
"learning_rate": 1.982549215707209e-05,
"loss": 0.5942,
"step": 64
},
{
"epoch": 0.15786278081360047,
"grad_norm": 0.376953125,
"learning_rate": 1.98181676005231e-05,
"loss": 0.6082,
"step": 65
},
{
"epoch": 0.16029143897996356,
"grad_norm": 0.478515625,
"learning_rate": 1.9810693881508548e-05,
"loss": 0.6838,
"step": 66
},
{
"epoch": 0.16272009714632665,
"grad_norm": 0.390625,
"learning_rate": 1.980307111357288e-05,
"loss": 0.5919,
"step": 67
},
{
"epoch": 0.16514875531268974,
"grad_norm": 0.369140625,
"learning_rate": 1.9795299412524948e-05,
"loss": 0.5769,
"step": 68
},
{
"epoch": 0.16757741347905283,
"grad_norm": 0.3828125,
"learning_rate": 1.9787378896436292e-05,
"loss": 0.6,
"step": 69
},
{
"epoch": 0.1700060716454159,
"grad_norm": 0.369140625,
"learning_rate": 1.9779309685639317e-05,
"loss": 0.5963,
"step": 70
},
{
"epoch": 0.172434729811779,
"grad_norm": 0.373046875,
"learning_rate": 1.9771091902725465e-05,
"loss": 0.5954,
"step": 71
},
{
"epoch": 0.17486338797814208,
"grad_norm": 0.384765625,
"learning_rate": 1.9762725672543372e-05,
"loss": 0.5892,
"step": 72
},
{
"epoch": 0.17729204614450517,
"grad_norm": 0.376953125,
"learning_rate": 1.9754211122196945e-05,
"loss": 0.5883,
"step": 73
},
{
"epoch": 0.17972070431086826,
"grad_norm": 0.373046875,
"learning_rate": 1.9745548381043454e-05,
"loss": 0.5925,
"step": 74
},
{
"epoch": 0.18214936247723132,
"grad_norm": 0.39453125,
"learning_rate": 1.9736737580691553e-05,
"loss": 0.5867,
"step": 75
},
{
"epoch": 0.1845780206435944,
"grad_norm": 0.373046875,
"learning_rate": 1.9727778854999283e-05,
"loss": 0.5931,
"step": 76
},
{
"epoch": 0.1870066788099575,
"grad_norm": 0.384765625,
"learning_rate": 1.9718672340072044e-05,
"loss": 0.5858,
"step": 77
},
{
"epoch": 0.1894353369763206,
"grad_norm": 0.380859375,
"learning_rate": 1.9709418174260523e-05,
"loss": 0.5933,
"step": 78
},
{
"epoch": 0.1894353369763206,
"eval_loss": 0.5919594168663025,
"eval_runtime": 97.3358,
"eval_samples_per_second": 30.821,
"eval_steps_per_second": 3.853,
"step": 78
},
{
"epoch": 0.19186399514268368,
"grad_norm": 0.3671875,
"learning_rate": 1.970001649815859e-05,
"loss": 0.5753,
"step": 79
},
{
"epoch": 0.19429265330904674,
"grad_norm": 0.380859375,
"learning_rate": 1.969046745460116e-05,
"loss": 0.5892,
"step": 80
},
{
"epoch": 0.19672131147540983,
"grad_norm": 0.3828125,
"learning_rate": 1.9680771188662044e-05,
"loss": 0.5917,
"step": 81
},
{
"epoch": 0.19914996964177292,
"grad_norm": 0.38671875,
"learning_rate": 1.9670927847651707e-05,
"loss": 0.5913,
"step": 82
},
{
"epoch": 0.201578627808136,
"grad_norm": 0.37890625,
"learning_rate": 1.9660937581115073e-05,
"loss": 0.5787,
"step": 83
},
{
"epoch": 0.2040072859744991,
"grad_norm": 0.37109375,
"learning_rate": 1.9650800540829204e-05,
"loss": 0.5779,
"step": 84
},
{
"epoch": 0.20643594414086217,
"grad_norm": 0.376953125,
"learning_rate": 1.964051688080105e-05,
"loss": 0.5912,
"step": 85
},
{
"epoch": 0.20886460230722526,
"grad_norm": 0.380859375,
"learning_rate": 1.963008675726506e-05,
"loss": 0.5879,
"step": 86
},
{
"epoch": 0.21129326047358835,
"grad_norm": 0.3671875,
"learning_rate": 1.9619510328680847e-05,
"loss": 0.5905,
"step": 87
},
{
"epoch": 0.21372191863995144,
"grad_norm": 0.375,
"learning_rate": 1.9608787755730746e-05,
"loss": 0.5789,
"step": 88
},
{
"epoch": 0.2161505768063145,
"grad_norm": 0.37890625,
"learning_rate": 1.9597919201317393e-05,
"loss": 0.5824,
"step": 89
},
{
"epoch": 0.2185792349726776,
"grad_norm": 0.37109375,
"learning_rate": 1.958690483056126e-05,
"loss": 0.5841,
"step": 90
},
{
"epoch": 0.22100789313904068,
"grad_norm": 0.37109375,
"learning_rate": 1.9575744810798118e-05,
"loss": 0.5709,
"step": 91
},
{
"epoch": 0.22343655130540377,
"grad_norm": 0.369140625,
"learning_rate": 1.9564439311576515e-05,
"loss": 0.5799,
"step": 92
},
{
"epoch": 0.22586520947176686,
"grad_norm": 0.369140625,
"learning_rate": 1.9552988504655194e-05,
"loss": 0.5757,
"step": 93
},
{
"epoch": 0.22829386763812992,
"grad_norm": 0.365234375,
"learning_rate": 1.954139256400049e-05,
"loss": 0.5768,
"step": 94
},
{
"epoch": 0.230722525804493,
"grad_norm": 0.470703125,
"learning_rate": 1.9529651665783675e-05,
"loss": 0.6447,
"step": 95
},
{
"epoch": 0.2331511839708561,
"grad_norm": 0.375,
"learning_rate": 1.951776598837829e-05,
"loss": 0.5888,
"step": 96
},
{
"epoch": 0.2355798421372192,
"grad_norm": 0.4453125,
"learning_rate": 1.9505735712357437e-05,
"loss": 0.6567,
"step": 97
},
{
"epoch": 0.23800850030358228,
"grad_norm": 0.376953125,
"learning_rate": 1.9493561020491024e-05,
"loss": 0.5866,
"step": 98
},
{
"epoch": 0.24043715846994534,
"grad_norm": 0.376953125,
"learning_rate": 1.9481242097743002e-05,
"loss": 0.5775,
"step": 99
},
{
"epoch": 0.24286581663630843,
"grad_norm": 0.369140625,
"learning_rate": 1.9468779131268553e-05,
"loss": 0.5796,
"step": 100
},
{
"epoch": 0.24529447480267152,
"grad_norm": 0.375,
"learning_rate": 1.9456172310411228e-05,
"loss": 0.5763,
"step": 101
},
{
"epoch": 0.24772313296903462,
"grad_norm": 0.3828125,
"learning_rate": 1.9443421826700096e-05,
"loss": 0.5766,
"step": 102
},
{
"epoch": 0.2501517911353977,
"grad_norm": 0.373046875,
"learning_rate": 1.9430527873846826e-05,
"loss": 0.5766,
"step": 103
},
{
"epoch": 0.25258044930176077,
"grad_norm": 0.3671875,
"learning_rate": 1.9417490647742738e-05,
"loss": 0.5796,
"step": 104
},
{
"epoch": 0.25258044930176077,
"eval_loss": 0.5772241950035095,
"eval_runtime": 97.0571,
"eval_samples_per_second": 30.91,
"eval_steps_per_second": 3.864,
"step": 104
},
{
"epoch": 0.2550091074681239,
"grad_norm": 0.376953125,
"learning_rate": 1.9404310346455822e-05,
"loss": 0.5762,
"step": 105
},
{
"epoch": 0.25743776563448695,
"grad_norm": 0.3828125,
"learning_rate": 1.9390987170227746e-05,
"loss": 0.5833,
"step": 106
},
{
"epoch": 0.25986642380085,
"grad_norm": 0.37890625,
"learning_rate": 1.9377521321470806e-05,
"loss": 0.5739,
"step": 107
},
{
"epoch": 0.26229508196721313,
"grad_norm": 0.380859375,
"learning_rate": 1.9363913004764847e-05,
"loss": 0.5771,
"step": 108
},
{
"epoch": 0.2647237401335762,
"grad_norm": 0.361328125,
"learning_rate": 1.9350162426854152e-05,
"loss": 0.5674,
"step": 109
},
{
"epoch": 0.2671523982999393,
"grad_norm": 0.36328125,
"learning_rate": 1.9336269796644314e-05,
"loss": 0.5698,
"step": 110
},
{
"epoch": 0.26958105646630237,
"grad_norm": 0.376953125,
"learning_rate": 1.9322235325199054e-05,
"loss": 0.5681,
"step": 111
},
{
"epoch": 0.27200971463266543,
"grad_norm": 0.37109375,
"learning_rate": 1.9308059225737015e-05,
"loss": 0.5615,
"step": 112
},
{
"epoch": 0.27443837279902855,
"grad_norm": 0.373046875,
"learning_rate": 1.9293741713628518e-05,
"loss": 0.5765,
"step": 113
},
{
"epoch": 0.2768670309653916,
"grad_norm": 0.375,
"learning_rate": 1.9279283006392304e-05,
"loss": 0.5633,
"step": 114
},
{
"epoch": 0.27929568913175473,
"grad_norm": 0.37890625,
"learning_rate": 1.9264683323692213e-05,
"loss": 0.5629,
"step": 115
},
{
"epoch": 0.2817243472981178,
"grad_norm": 0.376953125,
"learning_rate": 1.924994288733386e-05,
"loss": 0.5707,
"step": 116
},
{
"epoch": 0.28415300546448086,
"grad_norm": 0.3828125,
"learning_rate": 1.9235061921261248e-05,
"loss": 0.5658,
"step": 117
},
{
"epoch": 0.286581663630844,
"grad_norm": 0.376953125,
"learning_rate": 1.9220040651553388e-05,
"loss": 0.5672,
"step": 118
},
{
"epoch": 0.28901032179720704,
"grad_norm": 0.427734375,
"learning_rate": 1.9204879306420852e-05,
"loss": 0.5644,
"step": 119
},
{
"epoch": 0.29143897996357016,
"grad_norm": 0.515625,
"learning_rate": 1.918957811620231e-05,
"loss": 0.658,
"step": 120
},
{
"epoch": 0.2938676381299332,
"grad_norm": 0.384765625,
"learning_rate": 1.9174137313361012e-05,
"loss": 0.5673,
"step": 121
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.419921875,
"learning_rate": 1.915855713248129e-05,
"loss": 0.5713,
"step": 122
},
{
"epoch": 0.2987249544626594,
"grad_norm": 0.376953125,
"learning_rate": 1.9142837810264972e-05,
"loss": 0.5605,
"step": 123
},
{
"epoch": 0.30115361262902246,
"grad_norm": 0.451171875,
"learning_rate": 1.912697958552778e-05,
"loss": 0.634,
"step": 124
},
{
"epoch": 0.3035822707953855,
"grad_norm": 0.39453125,
"learning_rate": 1.9110982699195724e-05,
"loss": 0.5743,
"step": 125
},
{
"epoch": 0.30601092896174864,
"grad_norm": 0.41015625,
"learning_rate": 1.9094847394301427e-05,
"loss": 0.5743,
"step": 126
},
{
"epoch": 0.3084395871281117,
"grad_norm": 0.388671875,
"learning_rate": 1.907857391598043e-05,
"loss": 0.5685,
"step": 127
},
{
"epoch": 0.3108682452944748,
"grad_norm": 0.37890625,
"learning_rate": 1.906216251146748e-05,
"loss": 0.5718,
"step": 128
},
{
"epoch": 0.3132969034608379,
"grad_norm": 0.392578125,
"learning_rate": 1.904561343009276e-05,
"loss": 0.5666,
"step": 129
},
{
"epoch": 0.31572556162720095,
"grad_norm": 0.380859375,
"learning_rate": 1.902892692327811e-05,
"loss": 0.5487,
"step": 130
},
{
"epoch": 0.31572556162720095,
"eval_loss": 0.565579354763031,
"eval_runtime": 96.8785,
"eval_samples_per_second": 30.967,
"eval_steps_per_second": 3.871,
"step": 130
},
{
"epoch": 0.31815421979356406,
"grad_norm": 0.400390625,
"learning_rate": 1.9012103244533217e-05,
"loss": 0.5662,
"step": 131
},
{
"epoch": 0.3205828779599271,
"grad_norm": 0.376953125,
"learning_rate": 1.899514264945173e-05,
"loss": 0.5692,
"step": 132
},
{
"epoch": 0.32301153612629024,
"grad_norm": 0.37109375,
"learning_rate": 1.897804539570742e-05,
"loss": 0.5571,
"step": 133
},
{
"epoch": 0.3254401942926533,
"grad_norm": 0.384765625,
"learning_rate": 1.8960811743050227e-05,
"loss": 0.553,
"step": 134
},
{
"epoch": 0.32786885245901637,
"grad_norm": 0.3828125,
"learning_rate": 1.8943441953302346e-05,
"loss": 0.5598,
"step": 135
},
{
"epoch": 0.3302975106253795,
"grad_norm": 0.388671875,
"learning_rate": 1.8925936290354224e-05,
"loss": 0.5624,
"step": 136
},
{
"epoch": 0.33272616879174255,
"grad_norm": 0.37109375,
"learning_rate": 1.890829502016056e-05,
"loss": 0.5597,
"step": 137
},
{
"epoch": 0.33515482695810567,
"grad_norm": 0.380859375,
"learning_rate": 1.8890518410736275e-05,
"loss": 0.5575,
"step": 138
},
{
"epoch": 0.33758348512446873,
"grad_norm": 0.37109375,
"learning_rate": 1.8872606732152426e-05,
"loss": 0.5575,
"step": 139
},
{
"epoch": 0.3400121432908318,
"grad_norm": 0.373046875,
"learning_rate": 1.8854560256532098e-05,
"loss": 0.5549,
"step": 140
},
{
"epoch": 0.3424408014571949,
"grad_norm": 0.388671875,
"learning_rate": 1.8836379258046298e-05,
"loss": 0.5671,
"step": 141
},
{
"epoch": 0.344869459623558,
"grad_norm": 0.546875,
"learning_rate": 1.8818064012909755e-05,
"loss": 0.639,
"step": 142
},
{
"epoch": 0.3472981177899211,
"grad_norm": 0.5078125,
"learning_rate": 1.8799614799376743e-05,
"loss": 0.6433,
"step": 143
},
{
"epoch": 0.34972677595628415,
"grad_norm": 0.380859375,
"learning_rate": 1.878103189773686e-05,
"loss": 0.5656,
"step": 144
},
{
"epoch": 0.3521554341226472,
"grad_norm": 0.404296875,
"learning_rate": 1.876231559031075e-05,
"loss": 0.5631,
"step": 145
},
{
"epoch": 0.35458409228901033,
"grad_norm": 0.3828125,
"learning_rate": 1.8743466161445823e-05,
"loss": 0.5563,
"step": 146
},
{
"epoch": 0.3570127504553734,
"grad_norm": 0.388671875,
"learning_rate": 1.872448389751194e-05,
"loss": 0.5569,
"step": 147
},
{
"epoch": 0.3594414086217365,
"grad_norm": 0.3671875,
"learning_rate": 1.8705369086897063e-05,
"loss": 0.5548,
"step": 148
},
{
"epoch": 0.3618700667880996,
"grad_norm": 0.39453125,
"learning_rate": 1.8686122020002857e-05,
"loss": 0.5587,
"step": 149
},
{
"epoch": 0.36429872495446264,
"grad_norm": 0.390625,
"learning_rate": 1.86667429892403e-05,
"loss": 0.5508,
"step": 150
},
{
"epoch": 0.36672738312082576,
"grad_norm": 0.373046875,
"learning_rate": 1.8647232289025223e-05,
"loss": 0.5594,
"step": 151
},
{
"epoch": 0.3691560412871888,
"grad_norm": 0.37890625,
"learning_rate": 1.862759021577385e-05,
"loss": 0.5579,
"step": 152
},
{
"epoch": 0.37158469945355194,
"grad_norm": 0.373046875,
"learning_rate": 1.860781706789829e-05,
"loss": 0.5503,
"step": 153
},
{
"epoch": 0.374013357619915,
"grad_norm": 0.373046875,
"learning_rate": 1.8587913145801998e-05,
"loss": 0.5601,
"step": 154
},
{
"epoch": 0.37644201578627806,
"grad_norm": 0.380859375,
"learning_rate": 1.8567878751875218e-05,
"loss": 0.5516,
"step": 155
},
{
"epoch": 0.3788706739526412,
"grad_norm": 0.365234375,
"learning_rate": 1.8547714190490385e-05,
"loss": 0.552,
"step": 156
},
{
"epoch": 0.3788706739526412,
"eval_loss": 0.5556911826133728,
"eval_runtime": 96.9623,
"eval_samples_per_second": 30.94,
"eval_steps_per_second": 3.867,
"step": 156
},
{
"epoch": 0.38129933211900424,
"grad_norm": 0.3828125,
"learning_rate": 1.8527419767997506e-05,
"loss": 0.5618,
"step": 157
},
{
"epoch": 0.38372799028536736,
"grad_norm": 0.390625,
"learning_rate": 1.8506995792719498e-05,
"loss": 0.5561,
"step": 158
},
{
"epoch": 0.3861566484517304,
"grad_norm": 0.3671875,
"learning_rate": 1.848644257494751e-05,
"loss": 0.5486,
"step": 159
},
{
"epoch": 0.3885853066180935,
"grad_norm": 0.369140625,
"learning_rate": 1.8465760426936212e-05,
"loss": 0.5521,
"step": 160
},
{
"epoch": 0.3910139647844566,
"grad_norm": 0.373046875,
"learning_rate": 1.8444949662899038e-05,
"loss": 0.5474,
"step": 161
},
{
"epoch": 0.39344262295081966,
"grad_norm": 0.37109375,
"learning_rate": 1.8424010599003424e-05,
"loss": 0.5508,
"step": 162
},
{
"epoch": 0.3958712811171828,
"grad_norm": 0.388671875,
"learning_rate": 1.8402943553365998e-05,
"loss": 0.5483,
"step": 163
},
{
"epoch": 0.39829993928354585,
"grad_norm": 0.369140625,
"learning_rate": 1.838174884604776e-05,
"loss": 0.5525,
"step": 164
},
{
"epoch": 0.4007285974499089,
"grad_norm": 0.376953125,
"learning_rate": 1.8360426799049197e-05,
"loss": 0.5512,
"step": 165
},
{
"epoch": 0.403157255616272,
"grad_norm": 0.369140625,
"learning_rate": 1.8338977736305408e-05,
"loss": 0.5509,
"step": 166
},
{
"epoch": 0.4055859137826351,
"grad_norm": 0.37890625,
"learning_rate": 1.831740198368118e-05,
"loss": 0.5403,
"step": 167
},
{
"epoch": 0.4080145719489982,
"grad_norm": 0.3671875,
"learning_rate": 1.8295699868966038e-05,
"loss": 0.5507,
"step": 168
},
{
"epoch": 0.41044323011536127,
"grad_norm": 0.376953125,
"learning_rate": 1.8273871721869256e-05,
"loss": 0.5354,
"step": 169
},
{
"epoch": 0.41287188828172433,
"grad_norm": 0.361328125,
"learning_rate": 1.8251917874014854e-05,
"loss": 0.5483,
"step": 170
},
{
"epoch": 0.41530054644808745,
"grad_norm": 0.373046875,
"learning_rate": 1.8229838658936566e-05,
"loss": 0.5416,
"step": 171
},
{
"epoch": 0.4177292046144505,
"grad_norm": 0.38671875,
"learning_rate": 1.8207634412072765e-05,
"loss": 0.5547,
"step": 172
},
{
"epoch": 0.4201578627808136,
"grad_norm": 0.384765625,
"learning_rate": 1.8185305470761366e-05,
"loss": 0.548,
"step": 173
},
{
"epoch": 0.4225865209471767,
"grad_norm": 0.59375,
"learning_rate": 1.8162852174234712e-05,
"loss": 0.6328,
"step": 174
},
{
"epoch": 0.42501517911353975,
"grad_norm": 0.37109375,
"learning_rate": 1.81402748636144e-05,
"loss": 0.5406,
"step": 175
},
{
"epoch": 0.42744383727990287,
"grad_norm": 0.37890625,
"learning_rate": 1.8117573881906114e-05,
"loss": 0.5446,
"step": 176
},
{
"epoch": 0.42987249544626593,
"grad_norm": 0.3828125,
"learning_rate": 1.809474957399442e-05,
"loss": 0.5591,
"step": 177
},
{
"epoch": 0.432301153612629,
"grad_norm": 0.376953125,
"learning_rate": 1.8071802286637505e-05,
"loss": 0.5415,
"step": 178
},
{
"epoch": 0.4347298117789921,
"grad_norm": 0.384765625,
"learning_rate": 1.8048732368461927e-05,
"loss": 0.5362,
"step": 179
},
{
"epoch": 0.4371584699453552,
"grad_norm": 0.37890625,
"learning_rate": 1.8025540169957315e-05,
"loss": 0.5464,
"step": 180
},
{
"epoch": 0.4395871281117183,
"grad_norm": 0.384765625,
"learning_rate": 1.8002226043471025e-05,
"loss": 0.544,
"step": 181
},
{
"epoch": 0.44201578627808136,
"grad_norm": 0.388671875,
"learning_rate": 1.7978790343202826e-05,
"loss": 0.5567,
"step": 182
},
{
"epoch": 0.44201578627808136,
"eval_loss": 0.5476920008659363,
"eval_runtime": 97.0095,
"eval_samples_per_second": 30.925,
"eval_steps_per_second": 3.866,
"step": 182
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.369140625,
"learning_rate": 1.795523342519948e-05,
"loss": 0.5349,
"step": 183
},
{
"epoch": 0.44687310261080754,
"grad_norm": 0.38671875,
"learning_rate": 1.7931555647349358e-05,
"loss": 0.5494,
"step": 184
},
{
"epoch": 0.4493017607771706,
"grad_norm": 0.37890625,
"learning_rate": 1.7907757369376984e-05,
"loss": 0.5431,
"step": 185
},
{
"epoch": 0.4517304189435337,
"grad_norm": 0.380859375,
"learning_rate": 1.7883838952837595e-05,
"loss": 0.5455,
"step": 186
},
{
"epoch": 0.4541590771098968,
"grad_norm": 0.40625,
"learning_rate": 1.785980076111161e-05,
"loss": 0.5475,
"step": 187
},
{
"epoch": 0.45658773527625984,
"grad_norm": 0.384765625,
"learning_rate": 1.7835643159399156e-05,
"loss": 0.5426,
"step": 188
},
{
"epoch": 0.45901639344262296,
"grad_norm": 0.384765625,
"learning_rate": 1.7811366514714475e-05,
"loss": 0.549,
"step": 189
},
{
"epoch": 0.461445051608986,
"grad_norm": 0.3671875,
"learning_rate": 1.778697119588039e-05,
"loss": 0.5409,
"step": 190
},
{
"epoch": 0.46387370977534914,
"grad_norm": 0.55078125,
"learning_rate": 1.7762457573522658e-05,
"loss": 0.6053,
"step": 191
},
{
"epoch": 0.4663023679417122,
"grad_norm": 0.375,
"learning_rate": 1.7737826020064377e-05,
"loss": 0.5487,
"step": 192
},
{
"epoch": 0.46873102610807527,
"grad_norm": 0.4140625,
"learning_rate": 1.771307690972031e-05,
"loss": 0.5347,
"step": 193
},
{
"epoch": 0.4711596842744384,
"grad_norm": 0.37109375,
"learning_rate": 1.76882106184912e-05,
"loss": 0.5525,
"step": 194
},
{
"epoch": 0.47358834244080145,
"grad_norm": 0.37890625,
"learning_rate": 1.7663227524158053e-05,
"loss": 0.5423,
"step": 195
},
{
"epoch": 0.47601700060716456,
"grad_norm": 0.38671875,
"learning_rate": 1.7638128006276422e-05,
"loss": 0.5526,
"step": 196
},
{
"epoch": 0.4784456587735276,
"grad_norm": 0.369140625,
"learning_rate": 1.7612912446170615e-05,
"loss": 0.5464,
"step": 197
},
{
"epoch": 0.4808743169398907,
"grad_norm": 0.69921875,
"learning_rate": 1.758758122692791e-05,
"loss": 0.6096,
"step": 198
},
{
"epoch": 0.4833029751062538,
"grad_norm": 0.384765625,
"learning_rate": 1.7562134733392736e-05,
"loss": 0.5399,
"step": 199
},
{
"epoch": 0.48573163327261687,
"grad_norm": 0.39453125,
"learning_rate": 1.753657335216083e-05,
"loss": 0.5503,
"step": 200
},
{
"epoch": 0.48816029143898,
"grad_norm": 0.373046875,
"learning_rate": 1.751089747157336e-05,
"loss": 0.5389,
"step": 201
},
{
"epoch": 0.49058894960534305,
"grad_norm": 0.3828125,
"learning_rate": 1.7485107481711014e-05,
"loss": 0.548,
"step": 202
},
{
"epoch": 0.4930176077717061,
"grad_norm": 0.412109375,
"learning_rate": 1.7459203774388097e-05,
"loss": 0.5404,
"step": 203
},
{
"epoch": 0.49544626593806923,
"grad_norm": 0.37890625,
"learning_rate": 1.743318674314656e-05,
"loss": 0.5497,
"step": 204
},
{
"epoch": 0.4978749241044323,
"grad_norm": 0.373046875,
"learning_rate": 1.740705678325004e-05,
"loss": 0.5313,
"step": 205
},
{
"epoch": 0.5003035822707954,
"grad_norm": 0.375,
"learning_rate": 1.7380814291677818e-05,
"loss": 0.5446,
"step": 206
},
{
"epoch": 0.5027322404371585,
"grad_norm": 0.50390625,
"learning_rate": 1.7354459667118825e-05,
"loss": 0.6115,
"step": 207
},
{
"epoch": 0.5051608986035215,
"grad_norm": 0.37109375,
"learning_rate": 1.7327993309965583e-05,
"loss": 0.5263,
"step": 208
},
{
"epoch": 0.5051608986035215,
"eval_loss": 0.5407972931861877,
"eval_runtime": 97.5769,
"eval_samples_per_second": 30.745,
"eval_steps_per_second": 3.843,
"step": 208
},
{
"epoch": 0.5075895567698846,
"grad_norm": 0.3671875,
"learning_rate": 1.730141562230809e-05,
"loss": 0.5454,
"step": 209
},
{
"epoch": 0.5100182149362478,
"grad_norm": 0.373046875,
"learning_rate": 1.7274727007927747e-05,
"loss": 0.5417,
"step": 210
},
{
"epoch": 0.5124468731026108,
"grad_norm": 0.365234375,
"learning_rate": 1.72479278722912e-05,
"loss": 0.5337,
"step": 211
},
{
"epoch": 0.5148755312689739,
"grad_norm": 0.373046875,
"learning_rate": 1.7221018622544197e-05,
"loss": 0.5477,
"step": 212
},
{
"epoch": 0.517304189435337,
"grad_norm": 0.373046875,
"learning_rate": 1.7193999667505387e-05,
"loss": 0.533,
"step": 213
},
{
"epoch": 0.5197328476017,
"grad_norm": 0.369140625,
"learning_rate": 1.7166871417660116e-05,
"loss": 0.5203,
"step": 214
},
{
"epoch": 0.5221615057680632,
"grad_norm": 0.373046875,
"learning_rate": 1.7139634285154198e-05,
"loss": 0.5326,
"step": 215
},
{
"epoch": 0.5245901639344263,
"grad_norm": 0.57421875,
"learning_rate": 1.7112288683787637e-05,
"loss": 0.6092,
"step": 216
},
{
"epoch": 0.5270188221007893,
"grad_norm": 0.3671875,
"learning_rate": 1.708483502900836e-05,
"loss": 0.5417,
"step": 217
},
{
"epoch": 0.5294474802671524,
"grad_norm": 0.373046875,
"learning_rate": 1.7057273737905887e-05,
"loss": 0.5347,
"step": 218
},
{
"epoch": 0.5318761384335154,
"grad_norm": 0.37890625,
"learning_rate": 1.7029605229205005e-05,
"loss": 0.523,
"step": 219
},
{
"epoch": 0.5343047965998786,
"grad_norm": 0.37890625,
"learning_rate": 1.70018299232594e-05,
"loss": 0.5363,
"step": 220
},
{
"epoch": 0.5367334547662417,
"grad_norm": 0.361328125,
"learning_rate": 1.6973948242045284e-05,
"loss": 0.5287,
"step": 221
},
{
"epoch": 0.5391621129326047,
"grad_norm": 0.37109375,
"learning_rate": 1.6945960609154966e-05,
"loss": 0.5396,
"step": 222
},
{
"epoch": 0.5415907710989678,
"grad_norm": 0.3828125,
"learning_rate": 1.6917867449790432e-05,
"loss": 0.5198,
"step": 223
},
{
"epoch": 0.5440194292653309,
"grad_norm": 0.44921875,
"learning_rate": 1.688966919075687e-05,
"loss": 0.6069,
"step": 224
},
{
"epoch": 0.546448087431694,
"grad_norm": 0.380859375,
"learning_rate": 1.68613662604562e-05,
"loss": 0.5376,
"step": 225
},
{
"epoch": 0.5488767455980571,
"grad_norm": 0.375,
"learning_rate": 1.6832959088880557e-05,
"loss": 0.5264,
"step": 226
},
{
"epoch": 0.5513054037644202,
"grad_norm": 0.369140625,
"learning_rate": 1.6804448107605767e-05,
"loss": 0.5369,
"step": 227
},
{
"epoch": 0.5537340619307832,
"grad_norm": 0.375,
"learning_rate": 1.677583374978478e-05,
"loss": 0.537,
"step": 228
},
{
"epoch": 0.5561627200971463,
"grad_norm": 0.380859375,
"learning_rate": 1.6747116450141092e-05,
"loss": 0.5257,
"step": 229
},
{
"epoch": 0.5585913782635095,
"grad_norm": 0.369140625,
"learning_rate": 1.6718296644962146e-05,
"loss": 0.532,
"step": 230
},
{
"epoch": 0.5610200364298725,
"grad_norm": 0.3671875,
"learning_rate": 1.6689374772092695e-05,
"loss": 0.5382,
"step": 231
},
{
"epoch": 0.5634486945962356,
"grad_norm": 0.373046875,
"learning_rate": 1.6660351270928164e-05,
"loss": 0.5313,
"step": 232
},
{
"epoch": 0.5658773527625987,
"grad_norm": 0.37109375,
"learning_rate": 1.6631226582407954e-05,
"loss": 0.5283,
"step": 233
},
{
"epoch": 0.5683060109289617,
"grad_norm": 0.361328125,
"learning_rate": 1.660200114900876e-05,
"loss": 0.5466,
"step": 234
},
{
"epoch": 0.5683060109289617,
"eval_loss": 0.5350908637046814,
"eval_runtime": 97.0805,
"eval_samples_per_second": 30.902,
"eval_steps_per_second": 3.863,
"step": 234
},
{
"epoch": 0.5707346690953249,
"grad_norm": 0.3671875,
"learning_rate": 1.6572675414737844e-05,
"loss": 0.5343,
"step": 235
},
{
"epoch": 0.573163327261688,
"grad_norm": 0.375,
"learning_rate": 1.6543249825126285e-05,
"loss": 0.5405,
"step": 236
},
{
"epoch": 0.575591985428051,
"grad_norm": 0.37109375,
"learning_rate": 1.6513724827222225e-05,
"loss": 0.5252,
"step": 237
},
{
"epoch": 0.5780206435944141,
"grad_norm": 0.365234375,
"learning_rate": 1.6484100869584044e-05,
"loss": 0.5295,
"step": 238
},
{
"epoch": 0.5804493017607771,
"grad_norm": 0.361328125,
"learning_rate": 1.645437840227359e-05,
"loss": 0.5331,
"step": 239
},
{
"epoch": 0.5828779599271403,
"grad_norm": 0.36328125,
"learning_rate": 1.6424557876849308e-05,
"loss": 0.5274,
"step": 240
},
{
"epoch": 0.5853066180935034,
"grad_norm": 0.373046875,
"learning_rate": 1.639463974635939e-05,
"loss": 0.5303,
"step": 241
},
{
"epoch": 0.5877352762598664,
"grad_norm": 0.369140625,
"learning_rate": 1.636462446533489e-05,
"loss": 0.5319,
"step": 242
},
{
"epoch": 0.5901639344262295,
"grad_norm": 0.369140625,
"learning_rate": 1.6334512489782833e-05,
"loss": 0.5316,
"step": 243
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.388671875,
"learning_rate": 1.6304304277179267e-05,
"loss": 0.5291,
"step": 244
},
{
"epoch": 0.5950212507589556,
"grad_norm": 0.373046875,
"learning_rate": 1.627400028646231e-05,
"loss": 0.5341,
"step": 245
},
{
"epoch": 0.5974499089253188,
"grad_norm": 0.37109375,
"learning_rate": 1.6243600978025215e-05,
"loss": 0.5233,
"step": 246
},
{
"epoch": 0.5998785670916819,
"grad_norm": 0.37109375,
"learning_rate": 1.6213106813709328e-05,
"loss": 0.5251,
"step": 247
},
{
"epoch": 0.6023072252580449,
"grad_norm": 0.376953125,
"learning_rate": 1.6182518256797095e-05,
"loss": 0.534,
"step": 248
},
{
"epoch": 0.604735883424408,
"grad_norm": 0.365234375,
"learning_rate": 1.6151835772005028e-05,
"loss": 0.5215,
"step": 249
},
{
"epoch": 0.607164541590771,
"grad_norm": 0.375,
"learning_rate": 1.612105982547663e-05,
"loss": 0.5391,
"step": 250
},
{
"epoch": 0.6095931997571342,
"grad_norm": 0.37109375,
"learning_rate": 1.6090190884775333e-05,
"loss": 0.5316,
"step": 251
},
{
"epoch": 0.6120218579234973,
"grad_norm": 0.3671875,
"learning_rate": 1.605922941887737e-05,
"loss": 0.5251,
"step": 252
},
{
"epoch": 0.6144505160898603,
"grad_norm": 0.359375,
"learning_rate": 1.6028175898164665e-05,
"loss": 0.5239,
"step": 253
},
{
"epoch": 0.6168791742562234,
"grad_norm": 0.3671875,
"learning_rate": 1.599703079441769e-05,
"loss": 0.5229,
"step": 254
},
{
"epoch": 0.6193078324225865,
"grad_norm": 0.3828125,
"learning_rate": 1.5965794580808292e-05,
"loss": 0.5311,
"step": 255
},
{
"epoch": 0.6217364905889496,
"grad_norm": 0.36328125,
"learning_rate": 1.5934467731892497e-05,
"loss": 0.5217,
"step": 256
},
{
"epoch": 0.6241651487553127,
"grad_norm": 0.365234375,
"learning_rate": 1.590305072360331e-05,
"loss": 0.5299,
"step": 257
},
{
"epoch": 0.6265938069216758,
"grad_norm": 0.375,
"learning_rate": 1.5871544033243488e-05,
"loss": 0.52,
"step": 258
},
{
"epoch": 0.6290224650880388,
"grad_norm": 0.369140625,
"learning_rate": 1.583994813947827e-05,
"loss": 0.5168,
"step": 259
},
{
"epoch": 0.6314511232544019,
"grad_norm": 0.494140625,
"learning_rate": 1.5808263522328137e-05,
"loss": 0.6037,
"step": 260
},
{
"epoch": 0.6314511232544019,
"eval_loss": 0.5299703478813171,
"eval_runtime": 96.9378,
"eval_samples_per_second": 30.948,
"eval_steps_per_second": 3.868,
"step": 260
},
{
"epoch": 0.6338797814207651,
"grad_norm": 0.365234375,
"learning_rate": 1.5776490663161474e-05,
"loss": 0.517,
"step": 261
},
{
"epoch": 0.6363084395871281,
"grad_norm": 0.357421875,
"learning_rate": 1.5744630044687307e-05,
"loss": 0.5182,
"step": 262
},
{
"epoch": 0.6387370977534912,
"grad_norm": 0.369140625,
"learning_rate": 1.5712682150947926e-05,
"loss": 0.5219,
"step": 263
},
{
"epoch": 0.6411657559198543,
"grad_norm": 0.373046875,
"learning_rate": 1.568064746731156e-05,
"loss": 0.5323,
"step": 264
},
{
"epoch": 0.6435944140862173,
"grad_norm": 0.447265625,
"learning_rate": 1.5648526480464995e-05,
"loss": 0.5902,
"step": 265
},
{
"epoch": 0.6460230722525805,
"grad_norm": 0.37109375,
"learning_rate": 1.561631967840617e-05,
"loss": 0.5374,
"step": 266
},
{
"epoch": 0.6484517304189436,
"grad_norm": 0.36328125,
"learning_rate": 1.558402755043677e-05,
"loss": 0.5145,
"step": 267
},
{
"epoch": 0.6508803885853066,
"grad_norm": 0.37109375,
"learning_rate": 1.5551650587154815e-05,
"loss": 0.5213,
"step": 268
},
{
"epoch": 0.6533090467516697,
"grad_norm": 0.361328125,
"learning_rate": 1.5519189280447153e-05,
"loss": 0.5192,
"step": 269
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.369140625,
"learning_rate": 1.5486644123482047e-05,
"loss": 0.5325,
"step": 270
},
{
"epoch": 0.6581663630843959,
"grad_norm": 0.375,
"learning_rate": 1.545401561070163e-05,
"loss": 0.5286,
"step": 271
},
{
"epoch": 0.660595021250759,
"grad_norm": 0.37890625,
"learning_rate": 1.542130423781444e-05,
"loss": 0.526,
"step": 272
},
{
"epoch": 0.663023679417122,
"grad_norm": 0.37109375,
"learning_rate": 1.5388510501787855e-05,
"loss": 0.5317,
"step": 273
},
{
"epoch": 0.6654523375834851,
"grad_norm": 0.361328125,
"learning_rate": 1.5355634900840558e-05,
"loss": 0.5204,
"step": 274
},
{
"epoch": 0.6678809957498482,
"grad_norm": 0.369140625,
"learning_rate": 1.5322677934434965e-05,
"loss": 0.5215,
"step": 275
},
{
"epoch": 0.6703096539162113,
"grad_norm": 0.36328125,
"learning_rate": 1.5289640103269626e-05,
"loss": 0.5247,
"step": 276
},
{
"epoch": 0.6727383120825744,
"grad_norm": 0.369140625,
"learning_rate": 1.5256521909271644e-05,
"loss": 0.5163,
"step": 277
},
{
"epoch": 0.6751669702489375,
"grad_norm": 0.36328125,
"learning_rate": 1.5223323855589027e-05,
"loss": 0.5335,
"step": 278
},
{
"epoch": 0.6775956284153005,
"grad_norm": 0.36328125,
"learning_rate": 1.519004644658305e-05,
"loss": 0.5199,
"step": 279
},
{
"epoch": 0.6800242865816636,
"grad_norm": 0.3671875,
"learning_rate": 1.5156690187820596e-05,
"loss": 0.5294,
"step": 280
},
{
"epoch": 0.6824529447480268,
"grad_norm": 0.3671875,
"learning_rate": 1.5123255586066467e-05,
"loss": 0.5248,
"step": 281
},
{
"epoch": 0.6848816029143898,
"grad_norm": 0.369140625,
"learning_rate": 1.50897431492757e-05,
"loss": 0.5261,
"step": 282
},
{
"epoch": 0.6873102610807529,
"grad_norm": 0.36328125,
"learning_rate": 1.5056153386585828e-05,
"loss": 0.5246,
"step": 283
},
{
"epoch": 0.689738919247116,
"grad_norm": 0.3671875,
"learning_rate": 1.5022486808309171e-05,
"loss": 0.518,
"step": 284
},
{
"epoch": 0.692167577413479,
"grad_norm": 0.373046875,
"learning_rate": 1.498874392592506e-05,
"loss": 0.5222,
"step": 285
},
{
"epoch": 0.6945962355798422,
"grad_norm": 0.36328125,
"learning_rate": 1.4954925252072077e-05,
"loss": 0.5333,
"step": 286
},
{
"epoch": 0.6945962355798422,
"eval_loss": 0.5256316661834717,
"eval_runtime": 97.1941,
"eval_samples_per_second": 30.866,
"eval_steps_per_second": 3.858,
"step": 286
},
{
"epoch": 0.6970248937462052,
"grad_norm": 0.37109375,
"learning_rate": 1.4921031300540268e-05,
"loss": 0.5385,
"step": 287
},
{
"epoch": 0.6994535519125683,
"grad_norm": 0.36328125,
"learning_rate": 1.4887062586263334e-05,
"loss": 0.5203,
"step": 288
},
{
"epoch": 0.7018822100789314,
"grad_norm": 0.3671875,
"learning_rate": 1.4853019625310813e-05,
"loss": 0.5163,
"step": 289
},
{
"epoch": 0.7043108682452944,
"grad_norm": 0.359375,
"learning_rate": 1.4818902934880222e-05,
"loss": 0.5211,
"step": 290
},
{
"epoch": 0.7067395264116576,
"grad_norm": 0.36328125,
"learning_rate": 1.4784713033289228e-05,
"loss": 0.5251,
"step": 291
},
{
"epoch": 0.7091681845780207,
"grad_norm": 0.4765625,
"learning_rate": 1.4750450439967751e-05,
"loss": 0.5817,
"step": 292
},
{
"epoch": 0.7115968427443837,
"grad_norm": 0.373046875,
"learning_rate": 1.4716115675450078e-05,
"loss": 0.5178,
"step": 293
},
{
"epoch": 0.7140255009107468,
"grad_norm": 0.3828125,
"learning_rate": 1.4681709261366963e-05,
"loss": 0.5317,
"step": 294
},
{
"epoch": 0.7164541590771099,
"grad_norm": 0.3671875,
"learning_rate": 1.4647231720437687e-05,
"loss": 0.535,
"step": 295
},
{
"epoch": 0.718882817243473,
"grad_norm": 0.376953125,
"learning_rate": 1.4612683576462135e-05,
"loss": 0.5263,
"step": 296
},
{
"epoch": 0.7213114754098361,
"grad_norm": 0.376953125,
"learning_rate": 1.4578065354312816e-05,
"loss": 0.5162,
"step": 297
},
{
"epoch": 0.7237401335761992,
"grad_norm": 0.369140625,
"learning_rate": 1.4543377579926915e-05,
"loss": 0.5262,
"step": 298
},
{
"epoch": 0.7261687917425622,
"grad_norm": 0.390625,
"learning_rate": 1.4508620780298288e-05,
"loss": 0.5242,
"step": 299
},
{
"epoch": 0.7285974499089253,
"grad_norm": 0.384765625,
"learning_rate": 1.4473795483469442e-05,
"loss": 0.5258,
"step": 300
},
{
"epoch": 0.7310261080752884,
"grad_norm": 0.515625,
"learning_rate": 1.4438902218523537e-05,
"loss": 0.5909,
"step": 301
},
{
"epoch": 0.7334547662416515,
"grad_norm": 0.375,
"learning_rate": 1.4403941515576344e-05,
"loss": 0.5213,
"step": 302
},
{
"epoch": 0.7358834244080146,
"grad_norm": 0.3828125,
"learning_rate": 1.4368913905768178e-05,
"loss": 0.5192,
"step": 303
},
{
"epoch": 0.7383120825743776,
"grad_norm": 0.55078125,
"learning_rate": 1.4333819921255836e-05,
"loss": 0.5678,
"step": 304
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.3671875,
"learning_rate": 1.4298660095204516e-05,
"loss": 0.5247,
"step": 305
},
{
"epoch": 0.7431693989071039,
"grad_norm": 0.380859375,
"learning_rate": 1.4263434961779709e-05,
"loss": 0.5291,
"step": 306
},
{
"epoch": 0.7455980570734669,
"grad_norm": 0.392578125,
"learning_rate": 1.4228145056139097e-05,
"loss": 0.5241,
"step": 307
},
{
"epoch": 0.74802671523983,
"grad_norm": 0.37109375,
"learning_rate": 1.41927909144244e-05,
"loss": 0.5199,
"step": 308
},
{
"epoch": 0.7504553734061931,
"grad_norm": 0.361328125,
"learning_rate": 1.4157373073753255e-05,
"loss": 0.5341,
"step": 309
},
{
"epoch": 0.7528840315725561,
"grad_norm": 0.375,
"learning_rate": 1.412189207221104e-05,
"loss": 0.5282,
"step": 310
},
{
"epoch": 0.7553126897389193,
"grad_norm": 0.376953125,
"learning_rate": 1.4086348448842707e-05,
"loss": 0.5194,
"step": 311
},
{
"epoch": 0.7577413479052824,
"grad_norm": 0.36328125,
"learning_rate": 1.4050742743644588e-05,
"loss": 0.5139,
"step": 312
},
{
"epoch": 0.7577413479052824,
"eval_loss": 0.5217667818069458,
"eval_runtime": 96.9922,
"eval_samples_per_second": 30.93,
"eval_steps_per_second": 3.866,
"step": 312
},
{
"epoch": 0.7601700060716454,
"grad_norm": 0.37109375,
"learning_rate": 1.4015075497556193e-05,
"loss": 0.5176,
"step": 313
},
{
"epoch": 0.7625986642380085,
"grad_norm": 0.38671875,
"learning_rate": 1.3979347252451994e-05,
"loss": 0.5178,
"step": 314
},
{
"epoch": 0.7650273224043715,
"grad_norm": 0.3828125,
"learning_rate": 1.3943558551133186e-05,
"loss": 0.5258,
"step": 315
},
{
"epoch": 0.7674559805707347,
"grad_norm": 0.3671875,
"learning_rate": 1.3907709937319451e-05,
"loss": 0.5176,
"step": 316
},
{
"epoch": 0.7698846387370978,
"grad_norm": 0.625,
"learning_rate": 1.3871801955640682e-05,
"loss": 0.5865,
"step": 317
},
{
"epoch": 0.7723132969034608,
"grad_norm": 0.380859375,
"learning_rate": 1.3835835151628728e-05,
"loss": 0.5194,
"step": 318
},
{
"epoch": 0.7747419550698239,
"grad_norm": 0.396484375,
"learning_rate": 1.3799810071709088e-05,
"loss": 0.5213,
"step": 319
},
{
"epoch": 0.777170613236187,
"grad_norm": 0.37890625,
"learning_rate": 1.3763727263192626e-05,
"loss": 0.5276,
"step": 320
},
{
"epoch": 0.7795992714025501,
"grad_norm": 0.37890625,
"learning_rate": 1.3727587274267235e-05,
"loss": 0.5214,
"step": 321
},
{
"epoch": 0.7820279295689132,
"grad_norm": 0.384765625,
"learning_rate": 1.3691390653989536e-05,
"loss": 0.5307,
"step": 322
},
{
"epoch": 0.7844565877352763,
"grad_norm": 0.37890625,
"learning_rate": 1.365513795227651e-05,
"loss": 0.5252,
"step": 323
},
{
"epoch": 0.7868852459016393,
"grad_norm": 0.359375,
"learning_rate": 1.3618829719897158e-05,
"loss": 0.5186,
"step": 324
},
{
"epoch": 0.7893139040680024,
"grad_norm": 0.384765625,
"learning_rate": 1.3582466508464132e-05,
"loss": 0.5191,
"step": 325
},
{
"epoch": 0.7917425622343656,
"grad_norm": 0.3671875,
"learning_rate": 1.3546048870425356e-05,
"loss": 0.5268,
"step": 326
},
{
"epoch": 0.7941712204007286,
"grad_norm": 0.376953125,
"learning_rate": 1.3509577359055627e-05,
"loss": 0.53,
"step": 327
},
{
"epoch": 0.7965998785670917,
"grad_norm": 0.3671875,
"learning_rate": 1.3473052528448203e-05,
"loss": 0.5142,
"step": 328
},
{
"epoch": 0.7990285367334548,
"grad_norm": 0.384765625,
"learning_rate": 1.3436474933506412e-05,
"loss": 0.5148,
"step": 329
},
{
"epoch": 0.8014571948998178,
"grad_norm": 0.37109375,
"learning_rate": 1.3399845129935191e-05,
"loss": 0.5223,
"step": 330
},
{
"epoch": 0.803885853066181,
"grad_norm": 0.361328125,
"learning_rate": 1.3363163674232663e-05,
"loss": 0.5247,
"step": 331
},
{
"epoch": 0.806314511232544,
"grad_norm": 0.365234375,
"learning_rate": 1.3326431123681667e-05,
"loss": 0.52,
"step": 332
},
{
"epoch": 0.8087431693989071,
"grad_norm": 0.373046875,
"learning_rate": 1.328964803634131e-05,
"loss": 0.5172,
"step": 333
},
{
"epoch": 0.8111718275652702,
"grad_norm": 0.37890625,
"learning_rate": 1.3252814971038477e-05,
"loss": 0.5226,
"step": 334
},
{
"epoch": 0.8136004857316332,
"grad_norm": 0.369140625,
"learning_rate": 1.3215932487359338e-05,
"loss": 0.5214,
"step": 335
},
{
"epoch": 0.8160291438979964,
"grad_norm": 0.375,
"learning_rate": 1.3179001145640856e-05,
"loss": 0.5234,
"step": 336
},
{
"epoch": 0.8184578020643595,
"grad_norm": 0.39453125,
"learning_rate": 1.314202150696227e-05,
"loss": 0.5195,
"step": 337
},
{
"epoch": 0.8208864602307225,
"grad_norm": 0.359375,
"learning_rate": 1.3104994133136563e-05,
"loss": 0.5212,
"step": 338
},
{
"epoch": 0.8208864602307225,
"eval_loss": 0.5185486674308777,
"eval_runtime": 97.0358,
"eval_samples_per_second": 30.916,
"eval_steps_per_second": 3.865,
"step": 338
},
{
"epoch": 0.8233151183970856,
"grad_norm": 0.369140625,
"learning_rate": 1.3067919586701948e-05,
"loss": 0.5108,
"step": 339
},
{
"epoch": 0.8257437765634487,
"grad_norm": 0.37890625,
"learning_rate": 1.3030798430913289e-05,
"loss": 0.5175,
"step": 340
},
{
"epoch": 0.8281724347298117,
"grad_norm": 0.373046875,
"learning_rate": 1.2993631229733584e-05,
"loss": 0.5165,
"step": 341
},
{
"epoch": 0.8306010928961749,
"grad_norm": 0.369140625,
"learning_rate": 1.295641854782535e-05,
"loss": 0.5096,
"step": 342
},
{
"epoch": 0.833029751062538,
"grad_norm": 0.37109375,
"learning_rate": 1.2919160950542095e-05,
"loss": 0.5231,
"step": 343
},
{
"epoch": 0.835458409228901,
"grad_norm": 0.373046875,
"learning_rate": 1.2881859003919688e-05,
"loss": 0.512,
"step": 344
},
{
"epoch": 0.8378870673952641,
"grad_norm": 0.36328125,
"learning_rate": 1.284451327466778e-05,
"loss": 0.5081,
"step": 345
},
{
"epoch": 0.8403157255616271,
"grad_norm": 0.369140625,
"learning_rate": 1.2807124330161188e-05,
"loss": 0.5181,
"step": 346
},
{
"epoch": 0.8427443837279903,
"grad_norm": 0.36328125,
"learning_rate": 1.2769692738431279e-05,
"loss": 0.5191,
"step": 347
},
{
"epoch": 0.8451730418943534,
"grad_norm": 0.357421875,
"learning_rate": 1.2732219068157335e-05,
"loss": 0.499,
"step": 348
},
{
"epoch": 0.8476017000607164,
"grad_norm": 0.3828125,
"learning_rate": 1.2694703888657915e-05,
"loss": 0.5205,
"step": 349
},
{
"epoch": 0.8500303582270795,
"grad_norm": 0.5390625,
"learning_rate": 1.2657147769882215e-05,
"loss": 0.5799,
"step": 350
},
{
"epoch": 0.8524590163934426,
"grad_norm": 0.361328125,
"learning_rate": 1.261955128240139e-05,
"loss": 0.5102,
"step": 351
},
{
"epoch": 0.8548876745598057,
"grad_norm": 0.36328125,
"learning_rate": 1.2581914997399899e-05,
"loss": 0.514,
"step": 352
},
{
"epoch": 0.8573163327261688,
"grad_norm": 0.369140625,
"learning_rate": 1.2544239486666831e-05,
"loss": 0.5168,
"step": 353
},
{
"epoch": 0.8597449908925319,
"grad_norm": 0.392578125,
"learning_rate": 1.2506525322587207e-05,
"loss": 0.5138,
"step": 354
},
{
"epoch": 0.8621736490588949,
"grad_norm": 0.53125,
"learning_rate": 1.2468773078133286e-05,
"loss": 0.563,
"step": 355
},
{
"epoch": 0.864602307225258,
"grad_norm": 0.365234375,
"learning_rate": 1.2430983326855873e-05,
"loss": 0.5064,
"step": 356
},
{
"epoch": 0.8670309653916212,
"grad_norm": 0.369140625,
"learning_rate": 1.2393156642875579e-05,
"loss": 0.5148,
"step": 357
},
{
"epoch": 0.8694596235579842,
"grad_norm": 0.380859375,
"learning_rate": 1.2355293600874132e-05,
"loss": 0.5147,
"step": 358
},
{
"epoch": 0.8718882817243473,
"grad_norm": 0.376953125,
"learning_rate": 1.2317394776085614e-05,
"loss": 0.5164,
"step": 359
},
{
"epoch": 0.8743169398907104,
"grad_norm": 0.37109375,
"learning_rate": 1.2279460744287755e-05,
"loss": 0.5109,
"step": 360
},
{
"epoch": 0.8767455980570734,
"grad_norm": 0.375,
"learning_rate": 1.2241492081793145e-05,
"loss": 0.5184,
"step": 361
},
{
"epoch": 0.8791742562234366,
"grad_norm": 0.56640625,
"learning_rate": 1.220348936544052e-05,
"loss": 0.5627,
"step": 362
},
{
"epoch": 0.8816029143897997,
"grad_norm": 0.361328125,
"learning_rate": 1.2165453172585964e-05,
"loss": 0.5149,
"step": 363
},
{
"epoch": 0.8840315725561627,
"grad_norm": 0.388671875,
"learning_rate": 1.2127384081094167e-05,
"loss": 0.5109,
"step": 364
},
{
"epoch": 0.8840315725561627,
"eval_loss": 0.5158221125602722,
"eval_runtime": 97.2582,
"eval_samples_per_second": 30.846,
"eval_steps_per_second": 3.856,
"step": 364
},
{
"epoch": 0.8864602307225258,
"grad_norm": 0.37890625,
"learning_rate": 1.2089282669329625e-05,
"loss": 0.4993,
"step": 365
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.373046875,
"learning_rate": 1.205114951614785e-05,
"loss": 0.5187,
"step": 366
},
{
"epoch": 0.891317547055252,
"grad_norm": 0.357421875,
"learning_rate": 1.2012985200886602e-05,
"loss": 0.5088,
"step": 367
},
{
"epoch": 0.8937462052216151,
"grad_norm": 0.380859375,
"learning_rate": 1.197479030335706e-05,
"loss": 0.5167,
"step": 368
},
{
"epoch": 0.8961748633879781,
"grad_norm": 0.384765625,
"learning_rate": 1.1936565403835027e-05,
"loss": 0.5138,
"step": 369
},
{
"epoch": 0.8986035215543412,
"grad_norm": 0.369140625,
"learning_rate": 1.1898311083052113e-05,
"loss": 0.5062,
"step": 370
},
{
"epoch": 0.9010321797207043,
"grad_norm": 0.361328125,
"learning_rate": 1.1860027922186908e-05,
"loss": 0.5122,
"step": 371
},
{
"epoch": 0.9034608378870674,
"grad_norm": 0.361328125,
"learning_rate": 1.1821716502856154e-05,
"loss": 0.5108,
"step": 372
},
{
"epoch": 0.9058894960534305,
"grad_norm": 0.373046875,
"learning_rate": 1.1783377407105907e-05,
"loss": 0.5212,
"step": 373
},
{
"epoch": 0.9083181542197936,
"grad_norm": 0.3828125,
"learning_rate": 1.1745011217402709e-05,
"loss": 0.5079,
"step": 374
},
{
"epoch": 0.9107468123861566,
"grad_norm": 0.380859375,
"learning_rate": 1.1706618516624712e-05,
"loss": 0.5105,
"step": 375
},
{
"epoch": 0.9131754705525197,
"grad_norm": 0.36328125,
"learning_rate": 1.1668199888052844e-05,
"loss": 0.5123,
"step": 376
},
{
"epoch": 0.9156041287188829,
"grad_norm": 0.365234375,
"learning_rate": 1.1629755915361947e-05,
"loss": 0.5125,
"step": 377
},
{
"epoch": 0.9180327868852459,
"grad_norm": 0.373046875,
"learning_rate": 1.159128718261189e-05,
"loss": 0.5021,
"step": 378
},
{
"epoch": 0.920461445051609,
"grad_norm": 0.359375,
"learning_rate": 1.1552794274238723e-05,
"loss": 0.5158,
"step": 379
},
{
"epoch": 0.922890103217972,
"grad_norm": 0.361328125,
"learning_rate": 1.1514277775045768e-05,
"loss": 0.5064,
"step": 380
},
{
"epoch": 0.9253187613843351,
"grad_norm": 0.369140625,
"learning_rate": 1.1475738270194767e-05,
"loss": 0.512,
"step": 381
},
{
"epoch": 0.9277474195506983,
"grad_norm": 0.365234375,
"learning_rate": 1.1437176345196967e-05,
"loss": 0.5236,
"step": 382
},
{
"epoch": 0.9301760777170613,
"grad_norm": 0.361328125,
"learning_rate": 1.1398592585904234e-05,
"loss": 0.5152,
"step": 383
},
{
"epoch": 0.9326047358834244,
"grad_norm": 0.35546875,
"learning_rate": 1.135998757850015e-05,
"loss": 0.522,
"step": 384
},
{
"epoch": 0.9350333940497875,
"grad_norm": 0.392578125,
"learning_rate": 1.1321361909491108e-05,
"loss": 0.5159,
"step": 385
},
{
"epoch": 0.9374620522161505,
"grad_norm": 0.369140625,
"learning_rate": 1.128271616569741e-05,
"loss": 0.5042,
"step": 386
},
{
"epoch": 0.9398907103825137,
"grad_norm": 0.361328125,
"learning_rate": 1.1244050934244333e-05,
"loss": 0.5161,
"step": 387
},
{
"epoch": 0.9423193685488768,
"grad_norm": 0.357421875,
"learning_rate": 1.1205366802553231e-05,
"loss": 0.5094,
"step": 388
},
{
"epoch": 0.9447480267152398,
"grad_norm": 0.365234375,
"learning_rate": 1.1166664358332595e-05,
"loss": 0.5165,
"step": 389
},
{
"epoch": 0.9471766848816029,
"grad_norm": 0.36328125,
"learning_rate": 1.1127944189569122e-05,
"loss": 0.5148,
"step": 390
},
{
"epoch": 0.9471766848816029,
"eval_loss": 0.5134184956550598,
"eval_runtime": 97.3787,
"eval_samples_per_second": 30.808,
"eval_steps_per_second": 3.851,
"step": 390
},
{
"epoch": 0.949605343047966,
"grad_norm": 0.375,
"learning_rate": 1.1089206884518802e-05,
"loss": 0.52,
"step": 391
},
{
"epoch": 0.9520340012143291,
"grad_norm": 0.361328125,
"learning_rate": 1.1050453031697958e-05,
"loss": 0.5141,
"step": 392
},
{
"epoch": 0.9544626593806922,
"grad_norm": 0.36328125,
"learning_rate": 1.1011683219874324e-05,
"loss": 0.5114,
"step": 393
},
{
"epoch": 0.9568913175470553,
"grad_norm": 0.35546875,
"learning_rate": 1.0972898038058077e-05,
"loss": 0.5128,
"step": 394
},
{
"epoch": 0.9593199757134183,
"grad_norm": 0.36328125,
"learning_rate": 1.093409807549292e-05,
"loss": 0.5107,
"step": 395
},
{
"epoch": 0.9617486338797814,
"grad_norm": 0.74609375,
"learning_rate": 1.0895283921647098e-05,
"loss": 0.5607,
"step": 396
},
{
"epoch": 0.9641772920461446,
"grad_norm": 0.36328125,
"learning_rate": 1.085645616620446e-05,
"loss": 0.5203,
"step": 397
},
{
"epoch": 0.9666059502125076,
"grad_norm": 0.359375,
"learning_rate": 1.0817615399055513e-05,
"loss": 0.511,
"step": 398
},
{
"epoch": 0.9690346083788707,
"grad_norm": 0.365234375,
"learning_rate": 1.0778762210288416e-05,
"loss": 0.5017,
"step": 399
},
{
"epoch": 0.9714632665452337,
"grad_norm": 0.359375,
"learning_rate": 1.0739897190180066e-05,
"loss": 0.5149,
"step": 400
},
{
"epoch": 0.9738919247115968,
"grad_norm": 0.57421875,
"learning_rate": 1.0701020929187096e-05,
"loss": 0.5721,
"step": 401
},
{
"epoch": 0.97632058287796,
"grad_norm": 0.36328125,
"learning_rate": 1.0662134017936924e-05,
"loss": 0.5081,
"step": 402
},
{
"epoch": 0.978749241044323,
"grad_norm": 0.54296875,
"learning_rate": 1.0623237047218771e-05,
"loss": 0.5709,
"step": 403
},
{
"epoch": 0.9811778992106861,
"grad_norm": 0.357421875,
"learning_rate": 1.0584330607974673e-05,
"loss": 0.5015,
"step": 404
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.359375,
"learning_rate": 1.054541529129054e-05,
"loss": 0.5167,
"step": 405
},
{
"epoch": 0.9860352155434122,
"grad_norm": 0.53515625,
"learning_rate": 1.0506491688387128e-05,
"loss": 0.5619,
"step": 406
},
{
"epoch": 0.9884638737097754,
"grad_norm": 0.365234375,
"learning_rate": 1.04675603906111e-05,
"loss": 0.5261,
"step": 407
},
{
"epoch": 0.9908925318761385,
"grad_norm": 0.359375,
"learning_rate": 1.0428621989426016e-05,
"loss": 0.4998,
"step": 408
},
{
"epoch": 0.9933211900425015,
"grad_norm": 0.37109375,
"learning_rate": 1.0389677076403351e-05,
"loss": 0.5051,
"step": 409
},
{
"epoch": 0.9957498482088646,
"grad_norm": 0.5390625,
"learning_rate": 1.0350726243213519e-05,
"loss": 0.569,
"step": 410
},
{
"epoch": 0.9981785063752276,
"grad_norm": 0.361328125,
"learning_rate": 1.0311770081616864e-05,
"loss": 0.514,
"step": 411
},
{
"epoch": 1.0006071645415908,
"grad_norm": 0.359375,
"learning_rate": 1.0272809183454701e-05,
"loss": 0.5084,
"step": 412
},
{
"epoch": 1.0030358227079539,
"grad_norm": 0.51171875,
"learning_rate": 1.0233844140640287e-05,
"loss": 0.5605,
"step": 413
},
{
"epoch": 1.005464480874317,
"grad_norm": 0.361328125,
"learning_rate": 1.0194875545149854e-05,
"loss": 0.507,
"step": 414
},
{
"epoch": 1.00789313904068,
"grad_norm": 0.36328125,
"learning_rate": 1.015590398901361e-05,
"loss": 0.5133,
"step": 415
},
{
"epoch": 1.010321797207043,
"grad_norm": 0.37109375,
"learning_rate": 1.0116930064306736e-05,
"loss": 0.5121,
"step": 416
},
{
"epoch": 1.010321797207043,
"eval_loss": 0.5115101933479309,
"eval_runtime": 96.8252,
"eval_samples_per_second": 30.984,
"eval_steps_per_second": 3.873,
"step": 416
},
{
"epoch": 1.0127504553734061,
"grad_norm": 0.357421875,
"learning_rate": 1.0077954363140407e-05,
"loss": 0.5109,
"step": 417
},
{
"epoch": 1.0151791135397692,
"grad_norm": 0.353515625,
"learning_rate": 1.0038977477652779e-05,
"loss": 0.4991,
"step": 418
},
{
"epoch": 1.002428658166363,
"grad_norm": 0.3984375,
"learning_rate": 1e-05,
"loss": 0.4774,
"step": 419
},
{
"epoch": 1.0048573163327261,
"grad_norm": 0.384765625,
"learning_rate": 9.961022522347226e-06,
"loss": 0.475,
"step": 420
},
{
"epoch": 1.0072859744990892,
"grad_norm": 0.365234375,
"learning_rate": 9.922045636859596e-06,
"loss": 0.4863,
"step": 421
},
{
"epoch": 1.0097146326654522,
"grad_norm": 0.373046875,
"learning_rate": 9.883069935693267e-06,
"loss": 0.4837,
"step": 422
},
{
"epoch": 1.0121432908318153,
"grad_norm": 0.38671875,
"learning_rate": 9.844096010986392e-06,
"loss": 0.479,
"step": 423
},
{
"epoch": 1.0145719489981786,
"grad_norm": 0.380859375,
"learning_rate": 9.80512445485015e-06,
"loss": 0.4849,
"step": 424
},
{
"epoch": 1.0170006071645417,
"grad_norm": 0.3828125,
"learning_rate": 9.766155859359718e-06,
"loss": 0.4765,
"step": 425
},
{
"epoch": 1.0194292653309047,
"grad_norm": 0.361328125,
"learning_rate": 9.7271908165453e-06,
"loss": 0.4773,
"step": 426
},
{
"epoch": 1.0218579234972678,
"grad_norm": 0.64453125,
"learning_rate": 9.688229918383138e-06,
"loss": 0.5238,
"step": 427
},
{
"epoch": 1.0242865816636308,
"grad_norm": 0.359375,
"learning_rate": 9.649273756786486e-06,
"loss": 0.483,
"step": 428
},
{
"epoch": 1.026715239829994,
"grad_norm": 0.361328125,
"learning_rate": 9.610322923596652e-06,
"loss": 0.4718,
"step": 429
},
{
"epoch": 1.029143897996357,
"grad_norm": 0.546875,
"learning_rate": 9.57137801057399e-06,
"loss": 0.5207,
"step": 430
},
{
"epoch": 1.03157255616272,
"grad_norm": 0.35546875,
"learning_rate": 9.532439609388901e-06,
"loss": 0.4787,
"step": 431
},
{
"epoch": 1.034001214329083,
"grad_norm": 0.3515625,
"learning_rate": 9.493508311612874e-06,
"loss": 0.4768,
"step": 432
},
{
"epoch": 1.0364298724954462,
"grad_norm": 0.359375,
"learning_rate": 9.454584708709462e-06,
"loss": 0.484,
"step": 433
},
{
"epoch": 1.0388585306618094,
"grad_norm": 0.361328125,
"learning_rate": 9.415669392025329e-06,
"loss": 0.4812,
"step": 434
},
{
"epoch": 1.0412871888281725,
"grad_norm": 0.353515625,
"learning_rate": 9.376762952781234e-06,
"loss": 0.475,
"step": 435
},
{
"epoch": 1.0437158469945356,
"grad_norm": 0.35546875,
"learning_rate": 9.337865982063076e-06,
"loss": 0.4726,
"step": 436
},
{
"epoch": 1.0461445051608986,
"grad_norm": 0.361328125,
"learning_rate": 9.298979070812908e-06,
"loss": 0.473,
"step": 437
},
{
"epoch": 1.0485731633272617,
"grad_norm": 0.353515625,
"learning_rate": 9.260102809819939e-06,
"loss": 0.4739,
"step": 438
},
{
"epoch": 1.0510018214936248,
"grad_norm": 0.361328125,
"learning_rate": 9.221237789711587e-06,
"loss": 0.4916,
"step": 439
},
{
"epoch": 1.0534304796599878,
"grad_norm": 0.35546875,
"learning_rate": 9.182384600944494e-06,
"loss": 0.4823,
"step": 440
},
{
"epoch": 1.0558591378263509,
"grad_norm": 0.349609375,
"learning_rate": 9.143543833795539e-06,
"loss": 0.4737,
"step": 441
},
{
"epoch": 1.058287795992714,
"grad_norm": 0.361328125,
"learning_rate": 9.104716078352906e-06,
"loss": 0.4788,
"step": 442
},
{
"epoch": 1.058287795992714,
"eval_loss": 0.5110214352607727,
"eval_runtime": 96.9242,
"eval_samples_per_second": 30.952,
"eval_steps_per_second": 3.869,
"step": 442
},
{
"epoch": 1.060716454159077,
"grad_norm": 0.365234375,
"learning_rate": 9.065901924507085e-06,
"loss": 0.4775,
"step": 443
},
{
"epoch": 1.0631451123254403,
"grad_norm": 0.35546875,
"learning_rate": 9.027101961941925e-06,
"loss": 0.4735,
"step": 444
},
{
"epoch": 1.0655737704918034,
"grad_norm": 0.353515625,
"learning_rate": 8.98831678012568e-06,
"loss": 0.4803,
"step": 445
},
{
"epoch": 1.0680024286581664,
"grad_norm": 0.353515625,
"learning_rate": 8.949546968302042e-06,
"loss": 0.4767,
"step": 446
},
{
"epoch": 1.0704310868245295,
"grad_norm": 0.36328125,
"learning_rate": 8.910793115481201e-06,
"loss": 0.4765,
"step": 447
},
{
"epoch": 1.0728597449908925,
"grad_norm": 0.357421875,
"learning_rate": 8.872055810430881e-06,
"loss": 0.4789,
"step": 448
},
{
"epoch": 1.0752884031572556,
"grad_norm": 0.671875,
"learning_rate": 8.833335641667408e-06,
"loss": 0.5243,
"step": 449
},
{
"epoch": 1.0777170613236187,
"grad_norm": 0.3515625,
"learning_rate": 8.79463319744677e-06,
"loss": 0.4769,
"step": 450
},
{
"epoch": 1.0801457194899817,
"grad_norm": 0.359375,
"learning_rate": 8.755949065755668e-06,
"loss": 0.4774,
"step": 451
},
{
"epoch": 1.0825743776563448,
"grad_norm": 0.373046875,
"learning_rate": 8.717283834302593e-06,
"loss": 0.4792,
"step": 452
},
{
"epoch": 1.0850030358227079,
"grad_norm": 0.353515625,
"learning_rate": 8.678638090508897e-06,
"loss": 0.4768,
"step": 453
},
{
"epoch": 1.0874316939890711,
"grad_norm": 0.35546875,
"learning_rate": 8.640012421499856e-06,
"loss": 0.4738,
"step": 454
},
{
"epoch": 1.0898603521554342,
"grad_norm": 0.5625,
"learning_rate": 8.601407414095771e-06,
"loss": 0.5251,
"step": 455
},
{
"epoch": 1.0922890103217973,
"grad_norm": 0.35546875,
"learning_rate": 8.562823654803035e-06,
"loss": 0.4847,
"step": 456
},
{
"epoch": 1.0947176684881603,
"grad_norm": 0.359375,
"learning_rate": 8.524261729805235e-06,
"loss": 0.4815,
"step": 457
},
{
"epoch": 1.0971463266545234,
"grad_norm": 0.7421875,
"learning_rate": 8.485722224954237e-06,
"loss": 0.5586,
"step": 458
},
{
"epoch": 1.0995749848208864,
"grad_norm": 0.353515625,
"learning_rate": 8.44720572576128e-06,
"loss": 0.4716,
"step": 459
},
{
"epoch": 1.1020036429872495,
"grad_norm": 0.35546875,
"learning_rate": 8.408712817388113e-06,
"loss": 0.4782,
"step": 460
},
{
"epoch": 1.1044323011536126,
"grad_norm": 0.50390625,
"learning_rate": 8.370244084638055e-06,
"loss": 0.5251,
"step": 461
},
{
"epoch": 1.1068609593199756,
"grad_norm": 0.53515625,
"learning_rate": 8.331800111947158e-06,
"loss": 0.5125,
"step": 462
},
{
"epoch": 1.1092896174863387,
"grad_norm": 0.5234375,
"learning_rate": 8.293381483375293e-06,
"loss": 0.5175,
"step": 463
},
{
"epoch": 1.111718275652702,
"grad_norm": 0.5234375,
"learning_rate": 8.254988782597295e-06,
"loss": 0.514,
"step": 464
},
{
"epoch": 1.114146933819065,
"grad_norm": 0.35546875,
"learning_rate": 8.216622592894097e-06,
"loss": 0.477,
"step": 465
},
{
"epoch": 1.116575591985428,
"grad_norm": 0.375,
"learning_rate": 8.178283497143851e-06,
"loss": 0.4873,
"step": 466
},
{
"epoch": 1.1190042501517912,
"grad_norm": 0.361328125,
"learning_rate": 8.139972077813093e-06,
"loss": 0.4805,
"step": 467
},
{
"epoch": 1.1214329083181542,
"grad_norm": 0.359375,
"learning_rate": 8.10168891694789e-06,
"loss": 0.4738,
"step": 468
},
{
"epoch": 1.1214329083181542,
"eval_loss": 0.5099829435348511,
"eval_runtime": 99.9835,
"eval_samples_per_second": 30.005,
"eval_steps_per_second": 3.751,
"step": 468
},
{
"epoch": 1.1238615664845173,
"grad_norm": 0.35546875,
"learning_rate": 8.063434596164974e-06,
"loss": 0.471,
"step": 469
},
{
"epoch": 1.1262902246508804,
"grad_norm": 0.357421875,
"learning_rate": 8.025209696642942e-06,
"loss": 0.4781,
"step": 470
},
{
"epoch": 1.1287188828172434,
"grad_norm": 0.353515625,
"learning_rate": 7.987014799113398e-06,
"loss": 0.4806,
"step": 471
},
{
"epoch": 1.1311475409836065,
"grad_norm": 0.35546875,
"learning_rate": 7.948850483852153e-06,
"loss": 0.4737,
"step": 472
},
{
"epoch": 1.1335761991499695,
"grad_norm": 0.6640625,
"learning_rate": 7.91071733067038e-06,
"loss": 0.5262,
"step": 473
},
{
"epoch": 1.1360048573163328,
"grad_norm": 0.365234375,
"learning_rate": 7.872615918905833e-06,
"loss": 0.4892,
"step": 474
},
{
"epoch": 1.138433515482696,
"grad_norm": 0.353515625,
"learning_rate": 7.83454682741404e-06,
"loss": 0.4825,
"step": 475
},
{
"epoch": 1.140862173649059,
"grad_norm": 0.353515625,
"learning_rate": 7.796510634559487e-06,
"loss": 0.4708,
"step": 476
},
{
"epoch": 1.143290831815422,
"grad_norm": 0.359375,
"learning_rate": 7.758507918206859e-06,
"loss": 0.474,
"step": 477
},
{
"epoch": 1.145719489981785,
"grad_norm": 0.357421875,
"learning_rate": 7.720539255712252e-06,
"loss": 0.4705,
"step": 478
},
{
"epoch": 1.1481481481481481,
"grad_norm": 0.3515625,
"learning_rate": 7.682605223914386e-06,
"loss": 0.4735,
"step": 479
},
{
"epoch": 1.1505768063145112,
"grad_norm": 0.3515625,
"learning_rate": 7.644706399125871e-06,
"loss": 0.4696,
"step": 480
},
{
"epoch": 1.1530054644808743,
"grad_norm": 0.365234375,
"learning_rate": 7.606843357124426e-06,
"loss": 0.4953,
"step": 481
},
{
"epoch": 1.1554341226472373,
"grad_norm": 0.3515625,
"learning_rate": 7.569016673144132e-06,
"loss": 0.4749,
"step": 482
},
{
"epoch": 1.1578627808136004,
"grad_norm": 0.353515625,
"learning_rate": 7.531226921866715e-06,
"loss": 0.4755,
"step": 483
},
{
"epoch": 1.1602914389799635,
"grad_norm": 0.353515625,
"learning_rate": 7.493474677412795e-06,
"loss": 0.4661,
"step": 484
},
{
"epoch": 1.1627200971463267,
"grad_norm": 0.63671875,
"learning_rate": 7.455760513333172e-06,
"loss": 0.5152,
"step": 485
},
{
"epoch": 1.1651487553126898,
"grad_norm": 0.35546875,
"learning_rate": 7.418085002600104e-06,
"loss": 0.4787,
"step": 486
},
{
"epoch": 1.1675774134790529,
"grad_norm": 0.353515625,
"learning_rate": 7.3804487175986135e-06,
"loss": 0.4718,
"step": 487
},
{
"epoch": 1.170006071645416,
"grad_norm": 0.3515625,
"learning_rate": 7.3428522301177894e-06,
"loss": 0.4728,
"step": 488
},
{
"epoch": 1.172434729811779,
"grad_norm": 0.353515625,
"learning_rate": 7.305296111342086e-06,
"loss": 0.4771,
"step": 489
},
{
"epoch": 1.174863387978142,
"grad_norm": 0.353515625,
"learning_rate": 7.267780931842667e-06,
"loss": 0.479,
"step": 490
},
{
"epoch": 1.177292046144505,
"grad_norm": 0.34765625,
"learning_rate": 7.230307261568725e-06,
"loss": 0.468,
"step": 491
},
{
"epoch": 1.1797207043108682,
"grad_norm": 0.353515625,
"learning_rate": 7.192875669838815e-06,
"loss": 0.475,
"step": 492
},
{
"epoch": 1.1821493624772312,
"grad_norm": 0.353515625,
"learning_rate": 7.155486725332224e-06,
"loss": 0.4683,
"step": 493
},
{
"epoch": 1.1845780206435945,
"grad_norm": 0.353515625,
"learning_rate": 7.118140996080313e-06,
"loss": 0.4818,
"step": 494
},
{
"epoch": 1.1845780206435945,
"eval_loss": 0.5090214610099792,
"eval_runtime": 100.5205,
"eval_samples_per_second": 29.845,
"eval_steps_per_second": 3.731,
"step": 494
},
{
"epoch": 1.1870066788099576,
"grad_norm": 0.48828125,
"learning_rate": 7.080839049457908e-06,
"loss": 0.513,
"step": 495
},
{
"epoch": 1.1894353369763206,
"grad_norm": 0.359375,
"learning_rate": 7.043581452174653e-06,
"loss": 0.4799,
"step": 496
},
{
"epoch": 1.1918639951426837,
"grad_norm": 0.5859375,
"learning_rate": 7.006368770266421e-06,
"loss": 0.5165,
"step": 497
},
{
"epoch": 1.1942926533090468,
"grad_norm": 0.349609375,
"learning_rate": 6.9692015690867135e-06,
"loss": 0.4774,
"step": 498
},
{
"epoch": 1.1967213114754098,
"grad_norm": 0.3515625,
"learning_rate": 6.932080413298055e-06,
"loss": 0.4723,
"step": 499
},
{
"epoch": 1.199149969641773,
"grad_norm": 0.357421875,
"learning_rate": 6.895005866863439e-06,
"loss": 0.4679,
"step": 500
},
{
"epoch": 1.201578627808136,
"grad_norm": 0.353515625,
"learning_rate": 6.857978493037734e-06,
"loss": 0.4769,
"step": 501
},
{
"epoch": 1.204007285974499,
"grad_norm": 0.357421875,
"learning_rate": 6.820998854359144e-06,
"loss": 0.4752,
"step": 502
},
{
"epoch": 1.206435944140862,
"grad_norm": 0.35546875,
"learning_rate": 6.784067512640666e-06,
"loss": 0.4781,
"step": 503
},
{
"epoch": 1.2088646023072251,
"grad_norm": 0.35546875,
"learning_rate": 6.7471850289615246e-06,
"loss": 0.4705,
"step": 504
},
{
"epoch": 1.2112932604735884,
"grad_norm": 0.7109375,
"learning_rate": 6.710351963658692e-06,
"loss": 0.5441,
"step": 505
},
{
"epoch": 1.2137219186399515,
"grad_norm": 0.35546875,
"learning_rate": 6.67356887631834e-06,
"loss": 0.4712,
"step": 506
},
{
"epoch": 1.2161505768063146,
"grad_norm": 0.357421875,
"learning_rate": 6.636836325767342e-06,
"loss": 0.4824,
"step": 507
},
{
"epoch": 1.2185792349726776,
"grad_norm": 0.357421875,
"learning_rate": 6.600154870064812e-06,
"loss": 0.4772,
"step": 508
},
{
"epoch": 1.2210078931390407,
"grad_norm": 0.349609375,
"learning_rate": 6.563525066493588e-06,
"loss": 0.4641,
"step": 509
},
{
"epoch": 1.2234365513054037,
"grad_norm": 0.35546875,
"learning_rate": 6.526947471551799e-06,
"loss": 0.4711,
"step": 510
},
{
"epoch": 1.2258652094717668,
"grad_norm": 0.34765625,
"learning_rate": 6.490422640944378e-06,
"loss": 0.4702,
"step": 511
},
{
"epoch": 1.2282938676381299,
"grad_norm": 0.353515625,
"learning_rate": 6.453951129574644e-06,
"loss": 0.4849,
"step": 512
},
{
"epoch": 1.230722525804493,
"grad_norm": 0.515625,
"learning_rate": 6.41753349153587e-06,
"loss": 0.5051,
"step": 513
},
{
"epoch": 1.2331511839708562,
"grad_norm": 0.3515625,
"learning_rate": 6.3811702801028465e-06,
"loss": 0.4701,
"step": 514
},
{
"epoch": 1.2355798421372193,
"grad_norm": 0.3515625,
"learning_rate": 6.344862047723495e-06,
"loss": 0.4765,
"step": 515
},
{
"epoch": 1.2380085003035823,
"grad_norm": 0.3515625,
"learning_rate": 6.30860934601047e-06,
"loss": 0.4827,
"step": 516
},
{
"epoch": 1.2404371584699454,
"grad_norm": 0.35546875,
"learning_rate": 6.272412725732767e-06,
"loss": 0.4787,
"step": 517
},
{
"epoch": 1.2428658166363085,
"grad_norm": 0.353515625,
"learning_rate": 6.236272736807378e-06,
"loss": 0.4825,
"step": 518
},
{
"epoch": 1.2452944748026715,
"grad_norm": 0.357421875,
"learning_rate": 6.200189928290916e-06,
"loss": 0.4799,
"step": 519
},
{
"epoch": 1.2477231329690346,
"grad_norm": 0.349609375,
"learning_rate": 6.1641648483712755e-06,
"loss": 0.4719,
"step": 520
},
{
"epoch": 1.2477231329690346,
"eval_loss": 0.5082234740257263,
"eval_runtime": 98.6416,
"eval_samples_per_second": 30.413,
"eval_steps_per_second": 3.802,
"step": 520
},
{
"epoch": 1.2501517911353976,
"grad_norm": 0.357421875,
"learning_rate": 6.128198044359322e-06,
"loss": 0.4689,
"step": 521
},
{
"epoch": 1.2525804493017607,
"grad_norm": 0.353515625,
"learning_rate": 6.09229006268055e-06,
"loss": 0.4821,
"step": 522
},
{
"epoch": 1.255009107468124,
"grad_norm": 0.359375,
"learning_rate": 6.056441448866817e-06,
"loss": 0.4793,
"step": 523
},
{
"epoch": 1.2574377656344868,
"grad_norm": 0.3515625,
"learning_rate": 6.020652747548008e-06,
"loss": 0.4761,
"step": 524
},
{
"epoch": 1.2598664238008501,
"grad_norm": 0.357421875,
"learning_rate": 5.984924502443807e-06,
"loss": 0.482,
"step": 525
},
{
"epoch": 1.2622950819672132,
"grad_norm": 0.345703125,
"learning_rate": 5.949257256355415e-06,
"loss": 0.4674,
"step": 526
},
{
"epoch": 1.2647237401335762,
"grad_norm": 0.34765625,
"learning_rate": 5.913651551157295e-06,
"loss": 0.4733,
"step": 527
},
{
"epoch": 1.2671523982999393,
"grad_norm": 0.349609375,
"learning_rate": 5.878107927788962e-06,
"loss": 0.4742,
"step": 528
},
{
"epoch": 1.2695810564663024,
"grad_norm": 0.349609375,
"learning_rate": 5.84262692624675e-06,
"loss": 0.476,
"step": 529
},
{
"epoch": 1.2720097146326654,
"grad_norm": 0.3515625,
"learning_rate": 5.8072090855756e-06,
"loss": 0.4698,
"step": 530
},
{
"epoch": 1.2744383727990285,
"grad_norm": 0.349609375,
"learning_rate": 5.7718549438609085e-06,
"loss": 0.4737,
"step": 531
},
{
"epoch": 1.2768670309653916,
"grad_norm": 0.3515625,
"learning_rate": 5.736565038220289e-06,
"loss": 0.4787,
"step": 532
},
{
"epoch": 1.2792956891317546,
"grad_norm": 0.3515625,
"learning_rate": 5.701339904795486e-06,
"loss": 0.4673,
"step": 533
},
{
"epoch": 1.281724347298118,
"grad_norm": 0.353515625,
"learning_rate": 5.666180078744169e-06,
"loss": 0.4786,
"step": 534
},
{
"epoch": 1.2841530054644807,
"grad_norm": 0.3515625,
"learning_rate": 5.6310860942318235e-06,
"loss": 0.4766,
"step": 535
},
{
"epoch": 1.286581663630844,
"grad_norm": 0.35546875,
"learning_rate": 5.5960584844236565e-06,
"loss": 0.4744,
"step": 536
},
{
"epoch": 1.289010321797207,
"grad_norm": 0.349609375,
"learning_rate": 5.561097781476463e-06,
"loss": 0.4706,
"step": 537
},
{
"epoch": 1.2914389799635702,
"grad_norm": 0.35546875,
"learning_rate": 5.5262045165305615e-06,
"loss": 0.474,
"step": 538
},
{
"epoch": 1.2938676381299332,
"grad_norm": 0.353515625,
"learning_rate": 5.491379219701718e-06,
"loss": 0.4737,
"step": 539
},
{
"epoch": 1.2962962962962963,
"grad_norm": 0.349609375,
"learning_rate": 5.456622420073084e-06,
"loss": 0.4797,
"step": 540
},
{
"epoch": 1.2987249544626593,
"grad_norm": 0.345703125,
"learning_rate": 5.421934645687185e-06,
"loss": 0.4779,
"step": 541
},
{
"epoch": 1.3011536126290224,
"grad_norm": 0.349609375,
"learning_rate": 5.387316423537869e-06,
"loss": 0.476,
"step": 542
},
{
"epoch": 1.3035822707953855,
"grad_norm": 0.353515625,
"learning_rate": 5.352768279562315e-06,
"loss": 0.4792,
"step": 543
},
{
"epoch": 1.3060109289617485,
"grad_norm": 0.51171875,
"learning_rate": 5.318290738633041e-06,
"loss": 0.5148,
"step": 544
},
{
"epoch": 1.3084395871281118,
"grad_norm": 0.3515625,
"learning_rate": 5.283884324549924e-06,
"loss": 0.4741,
"step": 545
},
{
"epoch": 1.3108682452944749,
"grad_norm": 0.3515625,
"learning_rate": 5.249549560032252e-06,
"loss": 0.4643,
"step": 546
},
{
"epoch": 1.3108682452944749,
"eval_loss": 0.5077295899391174,
"eval_runtime": 96.7746,
"eval_samples_per_second": 31.0,
"eval_steps_per_second": 3.875,
"step": 546
},
{
"epoch": 1.313296903460838,
"grad_norm": 0.353515625,
"learning_rate": 5.215286966710774e-06,
"loss": 0.4723,
"step": 547
},
{
"epoch": 1.315725561627201,
"grad_norm": 0.353515625,
"learning_rate": 5.18109706511978e-06,
"loss": 0.4812,
"step": 548
},
{
"epoch": 1.318154219793564,
"grad_norm": 0.359375,
"learning_rate": 5.146980374689192e-06,
"loss": 0.4683,
"step": 549
},
{
"epoch": 1.3205828779599271,
"grad_norm": 0.34765625,
"learning_rate": 5.112937413736667e-06,
"loss": 0.4731,
"step": 550
},
{
"epoch": 1.3230115361262902,
"grad_norm": 0.34765625,
"learning_rate": 5.078968699459736e-06,
"loss": 0.4687,
"step": 551
},
{
"epoch": 1.3254401942926533,
"grad_norm": 0.3515625,
"learning_rate": 5.045074747927927e-06,
"loss": 0.4781,
"step": 552
},
{
"epoch": 1.3278688524590163,
"grad_norm": 0.353515625,
"learning_rate": 5.011256074074945e-06,
"loss": 0.4764,
"step": 553
},
{
"epoch": 1.3302975106253796,
"grad_norm": 0.345703125,
"learning_rate": 4.977513191690834e-06,
"loss": 0.4628,
"step": 554
},
{
"epoch": 1.3327261687917424,
"grad_norm": 0.349609375,
"learning_rate": 4.943846613414172e-06,
"loss": 0.4751,
"step": 555
},
{
"epoch": 1.3351548269581057,
"grad_norm": 0.34765625,
"learning_rate": 4.910256850724306e-06,
"loss": 0.4742,
"step": 556
},
{
"epoch": 1.3375834851244688,
"grad_norm": 0.345703125,
"learning_rate": 4.8767444139335365e-06,
"loss": 0.4653,
"step": 557
},
{
"epoch": 1.3400121432908318,
"grad_norm": 0.3515625,
"learning_rate": 4.843309812179405e-06,
"loss": 0.4779,
"step": 558
},
{
"epoch": 1.342440801457195,
"grad_norm": 0.35546875,
"learning_rate": 4.809953553416954e-06,
"loss": 0.4845,
"step": 559
},
{
"epoch": 1.344869459623558,
"grad_norm": 0.35546875,
"learning_rate": 4.776676144410973e-06,
"loss": 0.4687,
"step": 560
},
{
"epoch": 1.347298117789921,
"grad_norm": 0.349609375,
"learning_rate": 4.743478090728356e-06,
"loss": 0.4819,
"step": 561
},
{
"epoch": 1.349726775956284,
"grad_norm": 0.349609375,
"learning_rate": 4.710359896730379e-06,
"loss": 0.4757,
"step": 562
},
{
"epoch": 1.3521554341226472,
"grad_norm": 0.3515625,
"learning_rate": 4.677322065565039e-06,
"loss": 0.4692,
"step": 563
},
{
"epoch": 1.3545840922890102,
"grad_norm": 0.349609375,
"learning_rate": 4.644365099159443e-06,
"loss": 0.4787,
"step": 564
},
{
"epoch": 1.3570127504553735,
"grad_norm": 0.50390625,
"learning_rate": 4.611489498212145e-06,
"loss": 0.5029,
"step": 565
},
{
"epoch": 1.3594414086217366,
"grad_norm": 0.34765625,
"learning_rate": 4.57869576218556e-06,
"loss": 0.473,
"step": 566
},
{
"epoch": 1.3618700667880996,
"grad_norm": 0.35546875,
"learning_rate": 4.545984389298371e-06,
"loss": 0.4751,
"step": 567
},
{
"epoch": 1.3642987249544627,
"grad_norm": 0.35546875,
"learning_rate": 4.5133558765179576e-06,
"loss": 0.4757,
"step": 568
},
{
"epoch": 1.3667273831208258,
"grad_norm": 0.34765625,
"learning_rate": 4.480810719552848e-06,
"loss": 0.4691,
"step": 569
},
{
"epoch": 1.3691560412871888,
"grad_norm": 0.35546875,
"learning_rate": 4.4483494128451885e-06,
"loss": 0.477,
"step": 570
},
{
"epoch": 1.3715846994535519,
"grad_norm": 0.3515625,
"learning_rate": 4.4159724495632295e-06,
"loss": 0.4775,
"step": 571
},
{
"epoch": 1.374013357619915,
"grad_norm": 0.349609375,
"learning_rate": 4.383680321593836e-06,
"loss": 0.4783,
"step": 572
},
{
"epoch": 1.374013357619915,
"eval_loss": 0.5073318481445312,
"eval_runtime": 103.2698,
"eval_samples_per_second": 29.05,
"eval_steps_per_second": 3.631,
"step": 572
},
{
"epoch": 1.376442015786278,
"grad_norm": 0.3515625,
"learning_rate": 4.35147351953501e-06,
"loss": 0.4735,
"step": 573
},
{
"epoch": 1.3788706739526413,
"grad_norm": 0.357421875,
"learning_rate": 4.319352532688444e-06,
"loss": 0.4667,
"step": 574
},
{
"epoch": 1.3812993321190041,
"grad_norm": 0.357421875,
"learning_rate": 4.287317849052075e-06,
"loss": 0.4788,
"step": 575
},
{
"epoch": 1.3837279902853674,
"grad_norm": 0.349609375,
"learning_rate": 4.255369955312698e-06,
"loss": 0.474,
"step": 576
},
{
"epoch": 1.3861566484517305,
"grad_norm": 0.349609375,
"learning_rate": 4.223509336838528e-06,
"loss": 0.4688,
"step": 577
},
{
"epoch": 1.3885853066180935,
"grad_norm": 0.353515625,
"learning_rate": 4.191736477671864e-06,
"loss": 0.4688,
"step": 578
},
{
"epoch": 1.3910139647844566,
"grad_norm": 0.3515625,
"learning_rate": 4.160051860521731e-06,
"loss": 0.4659,
"step": 579
},
{
"epoch": 1.3934426229508197,
"grad_norm": 0.35546875,
"learning_rate": 4.128455966756512e-06,
"loss": 0.4759,
"step": 580
},
{
"epoch": 1.3958712811171827,
"grad_norm": 0.35546875,
"learning_rate": 4.096949276396694e-06,
"loss": 0.4779,
"step": 581
},
{
"epoch": 1.3982999392835458,
"grad_norm": 0.3515625,
"learning_rate": 4.065532268107507e-06,
"loss": 0.4776,
"step": 582
},
{
"epoch": 1.4007285974499089,
"grad_norm": 0.3515625,
"learning_rate": 4.034205419191709e-06,
"loss": 0.4749,
"step": 583
},
{
"epoch": 1.403157255616272,
"grad_norm": 0.353515625,
"learning_rate": 4.002969205582314e-06,
"loss": 0.4791,
"step": 584
},
{
"epoch": 1.4055859137826352,
"grad_norm": 0.35546875,
"learning_rate": 3.971824101835341e-06,
"loss": 0.4723,
"step": 585
},
{
"epoch": 1.4080145719489983,
"grad_norm": 0.349609375,
"learning_rate": 3.940770581122634e-06,
"loss": 0.4803,
"step": 586
},
{
"epoch": 1.4104432301153613,
"grad_norm": 0.349609375,
"learning_rate": 3.909809115224674e-06,
"loss": 0.4667,
"step": 587
},
{
"epoch": 1.4128718882817244,
"grad_norm": 0.357421875,
"learning_rate": 3.878940174523371e-06,
"loss": 0.4795,
"step": 588
},
{
"epoch": 1.4153005464480874,
"grad_norm": 0.341796875,
"learning_rate": 3.848164227994976e-06,
"loss": 0.4631,
"step": 589
},
{
"epoch": 1.4177292046144505,
"grad_norm": 0.361328125,
"learning_rate": 3.8174817432029125e-06,
"loss": 0.4728,
"step": 590
},
{
"epoch": 1.4201578627808136,
"grad_norm": 0.345703125,
"learning_rate": 3.7868931862906756e-06,
"loss": 0.4658,
"step": 591
},
{
"epoch": 1.4225865209471766,
"grad_norm": 0.353515625,
"learning_rate": 3.7563990219747857e-06,
"loss": 0.4841,
"step": 592
},
{
"epoch": 1.4250151791135397,
"grad_norm": 0.3515625,
"learning_rate": 3.725999713537689e-06,
"loss": 0.4763,
"step": 593
},
{
"epoch": 1.427443837279903,
"grad_norm": 0.3515625,
"learning_rate": 3.695695722820737e-06,
"loss": 0.4804,
"step": 594
},
{
"epoch": 1.4298724954462658,
"grad_norm": 0.345703125,
"learning_rate": 3.6654875102171683e-06,
"loss": 0.4687,
"step": 595
},
{
"epoch": 1.432301153612629,
"grad_norm": 0.34765625,
"learning_rate": 3.635375534665111e-06,
"loss": 0.464,
"step": 596
},
{
"epoch": 1.4347298117789922,
"grad_norm": 0.34765625,
"learning_rate": 3.605360253640614e-06,
"loss": 0.4735,
"step": 597
},
{
"epoch": 1.4371584699453552,
"grad_norm": 0.34765625,
"learning_rate": 3.5754421231506953e-06,
"loss": 0.4782,
"step": 598
},
{
"epoch": 1.4371584699453552,
"eval_loss": 0.5070293545722961,
"eval_runtime": 107.3386,
"eval_samples_per_second": 27.949,
"eval_steps_per_second": 3.494,
"step": 598
},
{
"epoch": 1.4395871281117183,
"grad_norm": 0.353515625,
"learning_rate": 3.545621597726412e-06,
"loss": 0.4721,
"step": 599
},
{
"epoch": 1.4420157862780814,
"grad_norm": 0.357421875,
"learning_rate": 3.5158991304159572e-06,
"loss": 0.4755,
"step": 600
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.353515625,
"learning_rate": 3.48627517277778e-06,
"loss": 0.4827,
"step": 601
},
{
"epoch": 1.4468731026108075,
"grad_norm": 0.34765625,
"learning_rate": 3.4567501748737153e-06,
"loss": 0.4693,
"step": 602
},
{
"epoch": 1.4493017607771705,
"grad_norm": 0.349609375,
"learning_rate": 3.427324585262156e-06,
"loss": 0.468,
"step": 603
},
{
"epoch": 1.4517304189435336,
"grad_norm": 0.34375,
"learning_rate": 3.3979988509912443e-06,
"loss": 0.4715,
"step": 604
},
{
"epoch": 1.454159077109897,
"grad_norm": 0.353515625,
"learning_rate": 3.3687734175920505e-06,
"loss": 0.4844,
"step": 605
},
{
"epoch": 1.4565877352762597,
"grad_norm": 0.353515625,
"learning_rate": 3.339648729071836e-06,
"loss": 0.4731,
"step": 606
},
{
"epoch": 1.459016393442623,
"grad_norm": 0.34765625,
"learning_rate": 3.310625227907307e-06,
"loss": 0.4744,
"step": 607
},
{
"epoch": 1.461445051608986,
"grad_norm": 0.353515625,
"learning_rate": 3.281703355037854e-06,
"loss": 0.4771,
"step": 608
},
{
"epoch": 1.4638737097753491,
"grad_norm": 0.34765625,
"learning_rate": 3.2528835498589085e-06,
"loss": 0.471,
"step": 609
},
{
"epoch": 1.4663023679417122,
"grad_norm": 0.35546875,
"learning_rate": 3.2241662502152236e-06,
"loss": 0.4773,
"step": 610
},
{
"epoch": 1.4687310261080753,
"grad_norm": 0.349609375,
"learning_rate": 3.195551892394234e-06,
"loss": 0.4772,
"step": 611
},
{
"epoch": 1.4711596842744383,
"grad_norm": 0.35546875,
"learning_rate": 3.1670409111194454e-06,
"loss": 0.4707,
"step": 612
},
{
"epoch": 1.4735883424408014,
"grad_norm": 0.349609375,
"learning_rate": 3.138633739543805e-06,
"loss": 0.4759,
"step": 613
},
{
"epoch": 1.4760170006071647,
"grad_norm": 0.345703125,
"learning_rate": 3.110330809243134e-06,
"loss": 0.4693,
"step": 614
},
{
"epoch": 1.4784456587735275,
"grad_norm": 0.34375,
"learning_rate": 3.082132550209571e-06,
"loss": 0.4666,
"step": 615
},
{
"epoch": 1.4808743169398908,
"grad_norm": 0.349609375,
"learning_rate": 3.054039390845035e-06,
"loss": 0.4731,
"step": 616
},
{
"epoch": 1.4833029751062539,
"grad_norm": 0.34765625,
"learning_rate": 3.0260517579547166e-06,
"loss": 0.4782,
"step": 617
},
{
"epoch": 1.485731633272617,
"grad_norm": 0.5390625,
"learning_rate": 2.998170076740601e-06,
"loss": 0.5016,
"step": 618
},
{
"epoch": 1.48816029143898,
"grad_norm": 0.5078125,
"learning_rate": 2.9703947707949974e-06,
"loss": 0.5092,
"step": 619
},
{
"epoch": 1.490588949605343,
"grad_norm": 0.353515625,
"learning_rate": 2.9427262620941142e-06,
"loss": 0.4768,
"step": 620
},
{
"epoch": 1.4930176077717061,
"grad_norm": 0.349609375,
"learning_rate": 2.915164970991642e-06,
"loss": 0.4699,
"step": 621
},
{
"epoch": 1.4954462659380692,
"grad_norm": 0.34765625,
"learning_rate": 2.8877113162123637e-06,
"loss": 0.4729,
"step": 622
},
{
"epoch": 1.4978749241044322,
"grad_norm": 0.34765625,
"learning_rate": 2.8603657148458053e-06,
"loss": 0.4698,
"step": 623
},
{
"epoch": 1.5003035822707953,
"grad_norm": 0.34765625,
"learning_rate": 2.833128582339887e-06,
"loss": 0.4812,
"step": 624
},
{
"epoch": 1.5003035822707953,
"eval_loss": 0.5068376660346985,
"eval_runtime": 97.0195,
"eval_samples_per_second": 30.922,
"eval_steps_per_second": 3.865,
"step": 624
},
{
"epoch": 1.5027322404371586,
"grad_norm": 0.34765625,
"learning_rate": 2.806000332494617e-06,
"loss": 0.4651,
"step": 625
},
{
"epoch": 1.5051608986035214,
"grad_norm": 0.345703125,
"learning_rate": 2.778981377455806e-06,
"loss": 0.4681,
"step": 626
},
{
"epoch": 1.5075895567698847,
"grad_norm": 0.34765625,
"learning_rate": 2.7520721277088023e-06,
"loss": 0.4747,
"step": 627
},
{
"epoch": 1.5100182149362478,
"grad_norm": 0.353515625,
"learning_rate": 2.7252729920722564e-06,
"loss": 0.4736,
"step": 628
},
{
"epoch": 1.5124468731026108,
"grad_norm": 0.53125,
"learning_rate": 2.698584377691913e-06,
"loss": 0.5096,
"step": 629
},
{
"epoch": 1.514875531268974,
"grad_norm": 0.349609375,
"learning_rate": 2.6720066900344212e-06,
"loss": 0.4703,
"step": 630
},
{
"epoch": 1.517304189435337,
"grad_norm": 0.3515625,
"learning_rate": 2.6455403328811736e-06,
"loss": 0.4765,
"step": 631
},
{
"epoch": 1.5197328476017,
"grad_norm": 0.349609375,
"learning_rate": 2.6191857083221873e-06,
"loss": 0.4819,
"step": 632
},
{
"epoch": 1.522161505768063,
"grad_norm": 0.34765625,
"learning_rate": 2.5929432167499658e-06,
"loss": 0.4673,
"step": 633
},
{
"epoch": 1.5245901639344264,
"grad_norm": 0.34765625,
"learning_rate": 2.5668132568534377e-06,
"loss": 0.4748,
"step": 634
},
{
"epoch": 1.5270188221007892,
"grad_norm": 0.345703125,
"learning_rate": 2.540796225611907e-06,
"loss": 0.4674,
"step": 635
},
{
"epoch": 1.5294474802671525,
"grad_norm": 0.546875,
"learning_rate": 2.514892518288988e-06,
"loss": 0.5083,
"step": 636
},
{
"epoch": 1.5318761384335153,
"grad_norm": 0.51953125,
"learning_rate": 2.4891025284266436e-06,
"loss": 0.5049,
"step": 637
},
{
"epoch": 1.5343047965998786,
"grad_norm": 0.345703125,
"learning_rate": 2.463426647839173e-06,
"loss": 0.4701,
"step": 638
},
{
"epoch": 1.5367334547662417,
"grad_norm": 0.349609375,
"learning_rate": 2.4378652666072646e-06,
"loss": 0.4715,
"step": 639
},
{
"epoch": 1.5391621129326047,
"grad_norm": 0.498046875,
"learning_rate": 2.4124187730720916e-06,
"loss": 0.5031,
"step": 640
},
{
"epoch": 1.5415907710989678,
"grad_norm": 0.345703125,
"learning_rate": 2.387087553829386e-06,
"loss": 0.4734,
"step": 641
},
{
"epoch": 1.5440194292653309,
"grad_norm": 0.349609375,
"learning_rate": 2.361871993723579e-06,
"loss": 0.4649,
"step": 642
},
{
"epoch": 1.5464480874316942,
"grad_norm": 0.5,
"learning_rate": 2.3367724758419495e-06,
"loss": 0.5191,
"step": 643
},
{
"epoch": 1.548876745598057,
"grad_norm": 0.349609375,
"learning_rate": 2.3117893815088067e-06,
"loss": 0.4755,
"step": 644
},
{
"epoch": 1.5513054037644203,
"grad_norm": 0.353515625,
"learning_rate": 2.2869230902796934e-06,
"loss": 0.4805,
"step": 645
},
{
"epoch": 1.5537340619307831,
"grad_norm": 0.3515625,
"learning_rate": 2.2621739799356244e-06,
"loss": 0.4807,
"step": 646
},
{
"epoch": 1.5561627200971464,
"grad_norm": 0.349609375,
"learning_rate": 2.2375424264773447e-06,
"loss": 0.4818,
"step": 647
},
{
"epoch": 1.5585913782635095,
"grad_norm": 0.353515625,
"learning_rate": 2.2130288041196135e-06,
"loss": 0.4773,
"step": 648
},
{
"epoch": 1.5610200364298725,
"grad_norm": 0.34375,
"learning_rate": 2.188633485285525e-06,
"loss": 0.4696,
"step": 649
},
{
"epoch": 1.5634486945962356,
"grad_norm": 0.34765625,
"learning_rate": 2.1643568406008476e-06,
"loss": 0.4679,
"step": 650
},
{
"epoch": 1.5634486945962356,
"eval_loss": 0.5066995620727539,
"eval_runtime": 98.2878,
"eval_samples_per_second": 30.523,
"eval_steps_per_second": 3.815,
"step": 650
},
{
"epoch": 1.5658773527625987,
"grad_norm": 0.34765625,
"learning_rate": 2.1401992388883888e-06,
"loss": 0.4672,
"step": 651
},
{
"epoch": 1.5683060109289617,
"grad_norm": 0.33984375,
"learning_rate": 2.1161610471624084e-06,
"loss": 0.4629,
"step": 652
},
{
"epoch": 1.5707346690953248,
"grad_norm": 0.345703125,
"learning_rate": 2.092242630623016e-06,
"loss": 0.4701,
"step": 653
},
{
"epoch": 1.573163327261688,
"grad_norm": 0.349609375,
"learning_rate": 2.0684443526506415e-06,
"loss": 0.4767,
"step": 654
},
{
"epoch": 1.575591985428051,
"grad_norm": 0.349609375,
"learning_rate": 2.0447665748005206e-06,
"loss": 0.4677,
"step": 655
},
{
"epoch": 1.5780206435944142,
"grad_norm": 0.48046875,
"learning_rate": 2.021209656797174e-06,
"loss": 0.5038,
"step": 656
},
{
"epoch": 1.580449301760777,
"grad_norm": 0.349609375,
"learning_rate": 1.9977739565289743e-06,
"loss": 0.4732,
"step": 657
},
{
"epoch": 1.5828779599271403,
"grad_norm": 0.345703125,
"learning_rate": 1.974459830042691e-06,
"loss": 0.4743,
"step": 658
},
{
"epoch": 1.5853066180935034,
"grad_norm": 0.34375,
"learning_rate": 1.951267631538072e-06,
"loss": 0.4686,
"step": 659
},
{
"epoch": 1.5877352762598664,
"grad_norm": 0.59375,
"learning_rate": 1.928197713362495e-06,
"loss": 0.5074,
"step": 660
},
{
"epoch": 1.5901639344262295,
"grad_norm": 0.34765625,
"learning_rate": 1.9052504260055838e-06,
"loss": 0.4701,
"step": 661
},
{
"epoch": 1.5925925925925926,
"grad_norm": 0.34765625,
"learning_rate": 1.8824261180938875e-06,
"loss": 0.4757,
"step": 662
},
{
"epoch": 1.5950212507589556,
"grad_norm": 0.345703125,
"learning_rate": 1.8597251363856061e-06,
"loss": 0.4754,
"step": 663
},
{
"epoch": 1.5974499089253187,
"grad_norm": 0.345703125,
"learning_rate": 1.8371478257652908e-06,
"loss": 0.4718,
"step": 664
},
{
"epoch": 1.599878567091682,
"grad_norm": 0.349609375,
"learning_rate": 1.8146945292386343e-06,
"loss": 0.4765,
"step": 665
},
{
"epoch": 1.6023072252580448,
"grad_norm": 0.349609375,
"learning_rate": 1.7923655879272395e-06,
"loss": 0.4822,
"step": 666
},
{
"epoch": 1.604735883424408,
"grad_norm": 0.35546875,
"learning_rate": 1.7701613410634367e-06,
"loss": 0.4802,
"step": 667
},
{
"epoch": 1.607164541590771,
"grad_norm": 0.349609375,
"learning_rate": 1.7480821259851488e-06,
"loss": 0.4741,
"step": 668
},
{
"epoch": 1.6095931997571342,
"grad_norm": 0.349609375,
"learning_rate": 1.7261282781307486e-06,
"loss": 0.4686,
"step": 669
},
{
"epoch": 1.6120218579234973,
"grad_norm": 0.349609375,
"learning_rate": 1.7043001310339646e-06,
"loss": 0.4672,
"step": 670
},
{
"epoch": 1.6144505160898603,
"grad_norm": 0.34765625,
"learning_rate": 1.6825980163188204e-06,
"loss": 0.4727,
"step": 671
},
{
"epoch": 1.6168791742562234,
"grad_norm": 0.34765625,
"learning_rate": 1.661022263694594e-06,
"loss": 0.4805,
"step": 672
},
{
"epoch": 1.6193078324225865,
"grad_norm": 0.34765625,
"learning_rate": 1.6395732009508058e-06,
"loss": 0.469,
"step": 673
},
{
"epoch": 1.6217364905889498,
"grad_norm": 0.349609375,
"learning_rate": 1.6182511539522427e-06,
"loss": 0.4747,
"step": 674
},
{
"epoch": 1.6241651487553126,
"grad_norm": 0.34375,
"learning_rate": 1.5970564466340022e-06,
"loss": 0.4635,
"step": 675
},
{
"epoch": 1.6265938069216759,
"grad_norm": 0.349609375,
"learning_rate": 1.5759894009965793e-06,
"loss": 0.4725,
"step": 676
},
{
"epoch": 1.6265938069216759,
"eval_loss": 0.5065969824790955,
"eval_runtime": 97.0109,
"eval_samples_per_second": 30.924,
"eval_steps_per_second": 3.866,
"step": 676
},
{
"epoch": 1.6290224650880387,
"grad_norm": 0.349609375,
"learning_rate": 1.5550503371009652e-06,
"loss": 0.4762,
"step": 677
},
{
"epoch": 1.631451123254402,
"grad_norm": 0.34375,
"learning_rate": 1.5342395730637904e-06,
"loss": 0.4738,
"step": 678
},
{
"epoch": 1.633879781420765,
"grad_norm": 0.349609375,
"learning_rate": 1.5135574250524898e-06,
"loss": 0.4787,
"step": 679
},
{
"epoch": 1.6363084395871281,
"grad_norm": 0.345703125,
"learning_rate": 1.4930042072805062e-06,
"loss": 0.4681,
"step": 680
},
{
"epoch": 1.6387370977534912,
"grad_norm": 0.34765625,
"learning_rate": 1.4725802320024985e-06,
"loss": 0.4772,
"step": 681
},
{
"epoch": 1.6411657559198543,
"grad_norm": 0.349609375,
"learning_rate": 1.452285809509617e-06,
"loss": 0.4753,
"step": 682
},
{
"epoch": 1.6435944140862173,
"grad_norm": 0.3515625,
"learning_rate": 1.432121248124786e-06,
"loss": 0.4793,
"step": 683
},
{
"epoch": 1.6460230722525804,
"grad_norm": 0.3515625,
"learning_rate": 1.4120868541980026e-06,
"loss": 0.4766,
"step": 684
},
{
"epoch": 1.6484517304189437,
"grad_norm": 0.3515625,
"learning_rate": 1.39218293210171e-06,
"loss": 0.4742,
"step": 685
},
{
"epoch": 1.6508803885853065,
"grad_norm": 0.349609375,
"learning_rate": 1.372409784226152e-06,
"loss": 0.485,
"step": 686
},
{
"epoch": 1.6533090467516698,
"grad_norm": 0.3515625,
"learning_rate": 1.3527677109747784e-06,
"loss": 0.476,
"step": 687
},
{
"epoch": 1.6557377049180326,
"grad_norm": 0.349609375,
"learning_rate": 1.333257010759702e-06,
"loss": 0.4773,
"step": 688
},
{
"epoch": 1.658166363084396,
"grad_norm": 0.349609375,
"learning_rate": 1.3138779799971446e-06,
"loss": 0.4772,
"step": 689
},
{
"epoch": 1.660595021250759,
"grad_norm": 0.349609375,
"learning_rate": 1.294630913102939e-06,
"loss": 0.478,
"step": 690
},
{
"epoch": 1.663023679417122,
"grad_norm": 0.345703125,
"learning_rate": 1.2755161024880602e-06,
"loss": 0.472,
"step": 691
},
{
"epoch": 1.665452337583485,
"grad_norm": 0.34765625,
"learning_rate": 1.2565338385541792e-06,
"loss": 0.4716,
"step": 692
},
{
"epoch": 1.6678809957498482,
"grad_norm": 0.34375,
"learning_rate": 1.2376844096892526e-06,
"loss": 0.4646,
"step": 693
},
{
"epoch": 1.6703096539162114,
"grad_norm": 0.34765625,
"learning_rate": 1.2189681022631405e-06,
"loss": 0.4743,
"step": 694
},
{
"epoch": 1.6727383120825743,
"grad_norm": 0.34375,
"learning_rate": 1.2003852006232564e-06,
"loss": 0.4727,
"step": 695
},
{
"epoch": 1.6751669702489376,
"grad_norm": 0.34375,
"learning_rate": 1.181935987090247e-06,
"loss": 0.463,
"step": 696
},
{
"epoch": 1.6775956284153004,
"grad_norm": 0.349609375,
"learning_rate": 1.1636207419537038e-06,
"loss": 0.4799,
"step": 697
},
{
"epoch": 1.6800242865816637,
"grad_norm": 0.3515625,
"learning_rate": 1.1454397434679022e-06,
"loss": 0.4795,
"step": 698
},
{
"epoch": 1.6824529447480268,
"grad_norm": 0.3515625,
"learning_rate": 1.1273932678475764e-06,
"loss": 0.4748,
"step": 699
},
{
"epoch": 1.6848816029143898,
"grad_norm": 0.6171875,
"learning_rate": 1.1094815892637256e-06,
"loss": 0.5055,
"step": 700
},
{
"epoch": 1.6873102610807529,
"grad_norm": 0.34765625,
"learning_rate": 1.0917049798394408e-06,
"loss": 0.4721,
"step": 701
},
{
"epoch": 1.689738919247116,
"grad_norm": 0.345703125,
"learning_rate": 1.0740637096457773e-06,
"loss": 0.4645,
"step": 702
},
{
"epoch": 1.689738919247116,
"eval_loss": 0.5065945386886597,
"eval_runtime": 97.1015,
"eval_samples_per_second": 30.896,
"eval_steps_per_second": 3.862,
"step": 702
},
{
"epoch": 1.692167577413479,
"grad_norm": 0.353515625,
"learning_rate": 1.0565580466976566e-06,
"loss": 0.4757,
"step": 703
},
{
"epoch": 1.694596235579842,
"grad_norm": 0.3515625,
"learning_rate": 1.0391882569497758e-06,
"loss": 0.475,
"step": 704
},
{
"epoch": 1.6970248937462054,
"grad_norm": 0.345703125,
"learning_rate": 1.0219546042925842e-06,
"loss": 0.4777,
"step": 705
},
{
"epoch": 1.6994535519125682,
"grad_norm": 0.34765625,
"learning_rate": 1.0048573505482728e-06,
"loss": 0.4712,
"step": 706
},
{
"epoch": 1.7018822100789315,
"grad_norm": 0.54296875,
"learning_rate": 9.878967554667862e-07,
"loss": 0.5034,
"step": 707
},
{
"epoch": 1.7043108682452943,
"grad_norm": 0.34765625,
"learning_rate": 9.710730767218913e-07,
"loss": 0.469,
"step": 708
},
{
"epoch": 1.7067395264116576,
"grad_norm": 0.34375,
"learning_rate": 9.54386569907244e-07,
"loss": 0.4712,
"step": 709
},
{
"epoch": 1.7091681845780207,
"grad_norm": 0.34765625,
"learning_rate": 9.378374885325225e-07,
"loss": 0.4754,
"step": 710
},
{
"epoch": 1.7115968427443837,
"grad_norm": 0.345703125,
"learning_rate": 9.214260840195732e-07,
"loss": 0.4796,
"step": 711
},
{
"epoch": 1.7140255009107468,
"grad_norm": 0.345703125,
"learning_rate": 9.051526056985737e-07,
"loss": 0.467,
"step": 712
},
{
"epoch": 1.7164541590771099,
"grad_norm": 0.349609375,
"learning_rate": 8.890173008042768e-07,
"loss": 0.4749,
"step": 713
},
{
"epoch": 1.7188828172434731,
"grad_norm": 0.482421875,
"learning_rate": 8.730204144722232e-07,
"loss": 0.5046,
"step": 714
},
{
"epoch": 1.721311475409836,
"grad_norm": 0.34765625,
"learning_rate": 8.571621897350312e-07,
"loss": 0.4781,
"step": 715
},
{
"epoch": 1.7237401335761993,
"grad_norm": 0.34765625,
"learning_rate": 8.414428675187114e-07,
"loss": 0.4611,
"step": 716
},
{
"epoch": 1.726168791742562,
"grad_norm": 0.34375,
"learning_rate": 8.258626866389897e-07,
"loss": 0.4659,
"step": 717
},
{
"epoch": 1.7285974499089254,
"grad_norm": 0.34765625,
"learning_rate": 8.10421883797694e-07,
"loss": 0.467,
"step": 718
},
{
"epoch": 1.7310261080752884,
"grad_norm": 0.34375,
"learning_rate": 7.951206935791478e-07,
"loss": 0.4678,
"step": 719
},
{
"epoch": 1.7334547662416515,
"grad_norm": 0.345703125,
"learning_rate": 7.799593484466139e-07,
"loss": 0.4771,
"step": 720
},
{
"epoch": 1.7358834244080146,
"grad_norm": 0.34765625,
"learning_rate": 7.649380787387561e-07,
"loss": 0.4725,
"step": 721
},
{
"epoch": 1.7383120825743776,
"grad_norm": 0.34765625,
"learning_rate": 7.500571126661449e-07,
"loss": 0.4732,
"step": 722
},
{
"epoch": 1.7407407407407407,
"grad_norm": 0.34375,
"learning_rate": 7.35316676307789e-07,
"loss": 0.4716,
"step": 723
},
{
"epoch": 1.7431693989071038,
"grad_norm": 0.3515625,
"learning_rate": 7.207169936076974e-07,
"loss": 0.4721,
"step": 724
},
{
"epoch": 1.745598057073467,
"grad_norm": 0.34375,
"learning_rate": 7.06258286371484e-07,
"loss": 0.4726,
"step": 725
},
{
"epoch": 1.7480267152398299,
"grad_norm": 0.61328125,
"learning_rate": 6.919407742629891e-07,
"loss": 0.5167,
"step": 726
},
{
"epoch": 1.7504553734061932,
"grad_norm": 0.3515625,
"learning_rate": 6.77764674800947e-07,
"loss": 0.4826,
"step": 727
},
{
"epoch": 1.752884031572556,
"grad_norm": 0.349609375,
"learning_rate": 6.637302033556891e-07,
"loss": 0.4792,
"step": 728
},
{
"epoch": 1.752884031572556,
"eval_loss": 0.5065528750419617,
"eval_runtime": 97.2896,
"eval_samples_per_second": 30.836,
"eval_steps_per_second": 3.854,
"step": 728
},
{
"epoch": 1.7553126897389193,
"grad_norm": 0.34765625,
"learning_rate": 6.498375731458529e-07,
"loss": 0.4687,
"step": 729
},
{
"epoch": 1.7577413479052824,
"grad_norm": 0.349609375,
"learning_rate": 6.360869952351568e-07,
"loss": 0.4841,
"step": 730
},
{
"epoch": 1.7601700060716454,
"grad_norm": 0.34765625,
"learning_rate": 6.22478678529197e-07,
"loss": 0.4773,
"step": 731
},
{
"epoch": 1.7625986642380085,
"grad_norm": 0.3515625,
"learning_rate": 6.090128297722564e-07,
"loss": 0.476,
"step": 732
},
{
"epoch": 1.7650273224043715,
"grad_norm": 0.349609375,
"learning_rate": 5.956896535441803e-07,
"loss": 0.4749,
"step": 733
},
{
"epoch": 1.7674559805707348,
"grad_norm": 0.34765625,
"learning_rate": 5.825093522572666e-07,
"loss": 0.4828,
"step": 734
},
{
"epoch": 1.7698846387370977,
"grad_norm": 0.34375,
"learning_rate": 5.694721261531732e-07,
"loss": 0.4682,
"step": 735
},
{
"epoch": 1.772313296903461,
"grad_norm": 0.345703125,
"learning_rate": 5.565781732999043e-07,
"loss": 0.4733,
"step": 736
},
{
"epoch": 1.7747419550698238,
"grad_norm": 0.3515625,
"learning_rate": 5.438276895887761e-07,
"loss": 0.4767,
"step": 737
},
{
"epoch": 1.777170613236187,
"grad_norm": 0.35546875,
"learning_rate": 5.312208687314502e-07,
"loss": 0.4758,
"step": 738
},
{
"epoch": 1.7795992714025501,
"grad_norm": 0.3515625,
"learning_rate": 5.187579022569977e-07,
"loss": 0.4839,
"step": 739
},
{
"epoch": 1.7820279295689132,
"grad_norm": 0.470703125,
"learning_rate": 5.064389795089764e-07,
"loss": 0.5067,
"step": 740
},
{
"epoch": 1.7844565877352763,
"grad_norm": 0.5078125,
"learning_rate": 4.942642876425641e-07,
"loss": 0.5085,
"step": 741
},
{
"epoch": 1.7868852459016393,
"grad_norm": 0.34765625,
"learning_rate": 4.822340116217116e-07,
"loss": 0.4757,
"step": 742
},
{
"epoch": 1.7893139040680024,
"grad_norm": 0.349609375,
"learning_rate": 4.703483342163262e-07,
"loss": 0.4792,
"step": 743
},
{
"epoch": 1.7917425622343655,
"grad_norm": 0.349609375,
"learning_rate": 4.5860743599951186e-07,
"loss": 0.4667,
"step": 744
},
{
"epoch": 1.7941712204007287,
"grad_norm": 0.34765625,
"learning_rate": 4.470114953448079e-07,
"loss": 0.4772,
"step": 745
},
{
"epoch": 1.7965998785670916,
"grad_norm": 0.349609375,
"learning_rate": 4.3556068842348865e-07,
"loss": 0.4801,
"step": 746
},
{
"epoch": 1.7990285367334549,
"grad_norm": 0.34375,
"learning_rate": 4.2425518920188536e-07,
"loss": 0.4718,
"step": 747
},
{
"epoch": 1.8014571948998177,
"grad_norm": 0.349609375,
"learning_rate": 4.1309516943874196e-07,
"loss": 0.4731,
"step": 748
},
{
"epoch": 1.803885853066181,
"grad_norm": 0.349609375,
"learning_rate": 4.0208079868260696e-07,
"loss": 0.4812,
"step": 749
},
{
"epoch": 1.806314511232544,
"grad_norm": 0.345703125,
"learning_rate": 3.9121224426925675e-07,
"loss": 0.4739,
"step": 750
},
{
"epoch": 1.8087431693989071,
"grad_norm": 0.349609375,
"learning_rate": 3.8048967131915414e-07,
"loss": 0.4755,
"step": 751
},
{
"epoch": 1.8111718275652702,
"grad_norm": 0.349609375,
"learning_rate": 3.699132427349383e-07,
"loss": 0.4749,
"step": 752
},
{
"epoch": 1.8136004857316332,
"grad_norm": 0.345703125,
"learning_rate": 3.594831191989523e-07,
"loss": 0.4737,
"step": 753
},
{
"epoch": 1.8160291438979965,
"grad_norm": 0.34765625,
"learning_rate": 3.49199459170797e-07,
"loss": 0.4689,
"step": 754
},
{
"epoch": 1.8160291438979965,
"eval_loss": 0.506585955619812,
"eval_runtime": 101.0452,
"eval_samples_per_second": 29.69,
"eval_steps_per_second": 3.711,
"step": 754
},
{
"epoch": 1.8184578020643594,
"grad_norm": 0.34765625,
"learning_rate": 3.3906241888493005e-07,
"loss": 0.4732,
"step": 755
},
{
"epoch": 1.8208864602307226,
"grad_norm": 0.357421875,
"learning_rate": 3.2907215234829205e-07,
"loss": 0.4814,
"step": 756
},
{
"epoch": 1.8233151183970855,
"grad_norm": 0.34765625,
"learning_rate": 3.1922881133795827e-07,
"loss": 0.4705,
"step": 757
},
{
"epoch": 1.8257437765634488,
"grad_norm": 0.34765625,
"learning_rate": 3.095325453988385e-07,
"loss": 0.4727,
"step": 758
},
{
"epoch": 1.8281724347298116,
"grad_norm": 0.34375,
"learning_rate": 2.999835018414143e-07,
"loss": 0.4698,
"step": 759
},
{
"epoch": 1.830601092896175,
"grad_norm": 0.34765625,
"learning_rate": 2.905818257394799e-07,
"loss": 0.478,
"step": 760
},
{
"epoch": 1.833029751062538,
"grad_norm": 0.345703125,
"learning_rate": 2.8132765992795797e-07,
"loss": 0.4695,
"step": 761
},
{
"epoch": 1.835458409228901,
"grad_norm": 0.345703125,
"learning_rate": 2.722211450007206e-07,
"loss": 0.4722,
"step": 762
},
{
"epoch": 1.837887067395264,
"grad_norm": 0.345703125,
"learning_rate": 2.632624193084499e-07,
"loss": 0.4632,
"step": 763
},
{
"epoch": 1.8403157255616271,
"grad_norm": 0.34765625,
"learning_rate": 2.544516189565482e-07,
"loss": 0.4781,
"step": 764
},
{
"epoch": 1.8427443837279904,
"grad_norm": 0.345703125,
"learning_rate": 2.4578887780305704e-07,
"loss": 0.4755,
"step": 765
},
{
"epoch": 1.8451730418943533,
"grad_norm": 0.34765625,
"learning_rate": 2.3727432745663025e-07,
"loss": 0.4761,
"step": 766
},
{
"epoch": 1.8476017000607166,
"grad_norm": 0.349609375,
"learning_rate": 2.2890809727453612e-07,
"loss": 0.4747,
"step": 767
},
{
"epoch": 1.8500303582270794,
"grad_norm": 0.34765625,
"learning_rate": 2.2069031436068643e-07,
"loss": 0.4728,
"step": 768
},
{
"epoch": 1.8524590163934427,
"grad_norm": 0.349609375,
"learning_rate": 2.1262110356371047e-07,
"loss": 0.4824,
"step": 769
},
{
"epoch": 1.8548876745598057,
"grad_norm": 0.345703125,
"learning_rate": 2.0470058747505516e-07,
"loss": 0.4683,
"step": 770
},
{
"epoch": 1.8573163327261688,
"grad_norm": 0.3515625,
"learning_rate": 1.969288864271246e-07,
"loss": 0.4866,
"step": 771
},
{
"epoch": 1.8597449908925319,
"grad_norm": 0.3515625,
"learning_rate": 1.8930611849145131e-07,
"loss": 0.4797,
"step": 772
},
{
"epoch": 1.862173649058895,
"grad_norm": 0.345703125,
"learning_rate": 1.8183239947690112e-07,
"loss": 0.4676,
"step": 773
},
{
"epoch": 1.864602307225258,
"grad_norm": 0.34765625,
"learning_rate": 1.7450784292791456e-07,
"loss": 0.4668,
"step": 774
},
{
"epoch": 1.867030965391621,
"grad_norm": 0.34375,
"learning_rate": 1.6733256012278486e-07,
"loss": 0.4742,
"step": 775
},
{
"epoch": 1.8694596235579843,
"grad_norm": 0.34765625,
"learning_rate": 1.603066600719605e-07,
"loss": 0.4728,
"step": 776
},
{
"epoch": 1.8718882817243472,
"grad_norm": 0.34765625,
"learning_rate": 1.5343024951639752e-07,
"loss": 0.47,
"step": 777
},
{
"epoch": 1.8743169398907105,
"grad_norm": 0.34765625,
"learning_rate": 1.467034329259287e-07,
"loss": 0.4656,
"step": 778
},
{
"epoch": 1.8767455980570733,
"grad_norm": 0.3515625,
"learning_rate": 1.4012631249768592e-07,
"loss": 0.4858,
"step": 779
},
{
"epoch": 1.8791742562234366,
"grad_norm": 0.34765625,
"learning_rate": 1.336989881545403e-07,
"loss": 0.4646,
"step": 780
},
{
"epoch": 1.8791742562234366,
"eval_loss": 0.5065886974334717,
"eval_runtime": 100.9737,
"eval_samples_per_second": 29.711,
"eval_steps_per_second": 3.714,
"step": 780
},
{
"epoch": 1.8816029143897997,
"grad_norm": 0.349609375,
"learning_rate": 1.2742155754358553e-07,
"loss": 0.4823,
"step": 781
},
{
"epoch": 1.8840315725561627,
"grad_norm": 0.357421875,
"learning_rate": 1.2129411603465924e-07,
"loss": 0.4806,
"step": 782
},
{
"epoch": 1.8864602307225258,
"grad_norm": 0.3515625,
"learning_rate": 1.1531675671888621e-07,
"loss": 0.4909,
"step": 783
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.345703125,
"learning_rate": 1.0948957040727071e-07,
"loss": 0.4798,
"step": 784
},
{
"epoch": 1.8913175470552521,
"grad_norm": 0.34375,
"learning_rate": 1.0381264562931426e-07,
"loss": 0.4667,
"step": 785
},
{
"epoch": 1.893746205221615,
"grad_norm": 0.345703125,
"learning_rate": 9.828606863166779e-08,
"loss": 0.4703,
"step": 786
},
{
"epoch": 1.8961748633879782,
"grad_norm": 0.349609375,
"learning_rate": 9.290992337682936e-08,
"loss": 0.4799,
"step": 787
},
{
"epoch": 1.898603521554341,
"grad_norm": 0.35546875,
"learning_rate": 8.768429154185853e-08,
"loss": 0.478,
"step": 788
},
{
"epoch": 1.9010321797207044,
"grad_norm": 0.349609375,
"learning_rate": 8.260925251714514e-08,
"loss": 0.4779,
"step": 789
},
{
"epoch": 1.9034608378870674,
"grad_norm": 0.3515625,
"learning_rate": 7.768488340519464e-08,
"loss": 0.4801,
"step": 790
},
{
"epoch": 1.9058894960534305,
"grad_norm": 0.345703125,
"learning_rate": 7.291125901946027e-08,
"loss": 0.4716,
"step": 791
},
{
"epoch": 1.9083181542197936,
"grad_norm": 0.349609375,
"learning_rate": 6.828845188321054e-08,
"loss": 0.4739,
"step": 792
},
{
"epoch": 1.9107468123861566,
"grad_norm": 0.345703125,
"learning_rate": 6.381653222842011e-08,
"loss": 0.4673,
"step": 793
},
{
"epoch": 1.9131754705525197,
"grad_norm": 0.353515625,
"learning_rate": 5.949556799470846e-08,
"loss": 0.4853,
"step": 794
},
{
"epoch": 1.9156041287188827,
"grad_norm": 0.515625,
"learning_rate": 5.532562482830406e-08,
"loss": 0.5203,
"step": 795
},
{
"epoch": 1.918032786885246,
"grad_norm": 0.345703125,
"learning_rate": 5.1306766081048456e-08,
"loss": 0.4728,
"step": 796
},
{
"epoch": 1.9204614450516089,
"grad_norm": 0.64453125,
"learning_rate": 4.743905280943595e-08,
"loss": 0.5187,
"step": 797
},
{
"epoch": 1.9228901032179722,
"grad_norm": 0.349609375,
"learning_rate": 4.3722543773681016e-08,
"loss": 0.4797,
"step": 798
},
{
"epoch": 1.925318761384335,
"grad_norm": 0.345703125,
"learning_rate": 4.0157295436830116e-08,
"loss": 0.4678,
"step": 799
},
{
"epoch": 1.9277474195506983,
"grad_norm": 0.52734375,
"learning_rate": 3.674336196390238e-08,
"loss": 0.5106,
"step": 800
},
{
"epoch": 1.9301760777170613,
"grad_norm": 0.349609375,
"learning_rate": 3.3480795221066955e-08,
"loss": 0.4749,
"step": 801
},
{
"epoch": 1.9326047358834244,
"grad_norm": 0.34765625,
"learning_rate": 3.036964477485249e-08,
"loss": 0.4735,
"step": 802
},
{
"epoch": 1.9350333940497875,
"grad_norm": 0.353515625,
"learning_rate": 2.7409957891397775e-08,
"loss": 0.476,
"step": 803
},
{
"epoch": 1.9374620522161505,
"grad_norm": 0.345703125,
"learning_rate": 2.4601779535733394e-08,
"loss": 0.4695,
"step": 804
},
{
"epoch": 1.9398907103825138,
"grad_norm": 0.34765625,
"learning_rate": 2.1945152371094512e-08,
"loss": 0.4808,
"step": 805
},
{
"epoch": 1.9423193685488767,
"grad_norm": 0.349609375,
"learning_rate": 1.944011675827695e-08,
"loss": 0.4719,
"step": 806
},
{
"epoch": 1.9423193685488767,
"eval_loss": 0.5065895318984985,
"eval_runtime": 96.8435,
"eval_samples_per_second": 30.978,
"eval_steps_per_second": 3.872,
"step": 806
},
{
"epoch": 1.94474802671524,
"grad_norm": 0.349609375,
"learning_rate": 1.7086710755024327e-08,
"loss": 0.4724,
"step": 807
},
{
"epoch": 1.9471766848816028,
"grad_norm": 0.34765625,
"learning_rate": 1.4884970115444097e-08,
"loss": 0.4684,
"step": 808
},
{
"epoch": 1.949605343047966,
"grad_norm": 0.3515625,
"learning_rate": 1.2834928289472415e-08,
"loss": 0.4773,
"step": 809
},
{
"epoch": 1.9520340012143291,
"grad_norm": 0.349609375,
"learning_rate": 1.0936616422358992e-08,
"loss": 0.4767,
"step": 810
},
{
"epoch": 1.9544626593806922,
"grad_norm": 0.34765625,
"learning_rate": 9.190063354198586e-09,
"loss": 0.4771,
"step": 811
},
{
"epoch": 1.9568913175470553,
"grad_norm": 0.34375,
"learning_rate": 7.595295619490239e-09,
"loss": 0.4729,
"step": 812
},
{
"epoch": 1.9593199757134183,
"grad_norm": 0.3515625,
"learning_rate": 6.152337446736489e-09,
"loss": 0.4754,
"step": 813
},
{
"epoch": 1.9617486338797814,
"grad_norm": 0.353515625,
"learning_rate": 4.861210758071444e-09,
"loss": 0.4906,
"step": 814
},
{
"epoch": 1.9641772920461444,
"grad_norm": 0.345703125,
"learning_rate": 3.7219351689310455e-09,
"loss": 0.4767,
"step": 815
},
{
"epoch": 1.9666059502125077,
"grad_norm": 0.349609375,
"learning_rate": 2.734527987755531e-09,
"loss": 0.4862,
"step": 816
},
{
"epoch": 1.9690346083788706,
"grad_norm": 0.345703125,
"learning_rate": 1.899004215722977e-09,
"loss": 0.4682,
"step": 817
},
{
"epoch": 1.9714632665452339,
"grad_norm": 0.3515625,
"learning_rate": 1.2153765465250378e-09,
"loss": 0.4798,
"step": 818
},
{
"epoch": 1.9738919247115967,
"grad_norm": 0.34765625,
"learning_rate": 6.836553661715429e-10,
"loss": 0.4743,
"step": 819
},
{
"epoch": 1.97632058287796,
"grad_norm": 0.353515625,
"learning_rate": 3.038487528350675e-10,
"loss": 0.4736,
"step": 820
},
{
"epoch": 1.978749241044323,
"grad_norm": 0.341796875,
"learning_rate": 7.596247672325696e-11,
"loss": 0.4709,
"step": 821
},
{
"epoch": 1.981177899210686,
"grad_norm": 0.345703125,
"learning_rate": 0.0,
"loss": 0.4669,
"step": 822
}
],
"logging_steps": 1,
"max_steps": 822,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 411,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8283124614508839e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}