{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 708,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002824858757062147,
"grad_norm": 0.6147084683934281,
"learning_rate": 9.999950776495983e-06,
"loss": 0.1718,
"step": 1
},
{
"epoch": 0.005649717514124294,
"grad_norm": 0.6665089926445491,
"learning_rate": 9.99980310695311e-06,
"loss": 0.1906,
"step": 2
},
{
"epoch": 0.00847457627118644,
"grad_norm": 0.5879608524504671,
"learning_rate": 9.99955699427891e-06,
"loss": 0.1905,
"step": 3
},
{
"epoch": 0.011299435028248588,
"grad_norm": 0.5728624077474995,
"learning_rate": 9.999212443319191e-06,
"loss": 0.1806,
"step": 4
},
{
"epoch": 0.014124293785310734,
"grad_norm": 0.45407189292974287,
"learning_rate": 9.998769460857955e-06,
"loss": 0.1587,
"step": 5
},
{
"epoch": 0.01694915254237288,
"grad_norm": 0.695598470337452,
"learning_rate": 9.998228055617264e-06,
"loss": 0.3073,
"step": 6
},
{
"epoch": 0.01977401129943503,
"grad_norm": 0.4388502111835372,
"learning_rate": 9.99758823825706e-06,
"loss": 0.1658,
"step": 7
},
{
"epoch": 0.022598870056497175,
"grad_norm": 0.506295761059801,
"learning_rate": 9.996850021374969e-06,
"loss": 0.1822,
"step": 8
},
{
"epoch": 0.025423728813559324,
"grad_norm": 0.5054469922635372,
"learning_rate": 9.996013419506035e-06,
"loss": 0.1878,
"step": 9
},
{
"epoch": 0.02824858757062147,
"grad_norm": 0.5236855115593358,
"learning_rate": 9.99507844912245e-06,
"loss": 0.164,
"step": 10
},
{
"epoch": 0.031073446327683617,
"grad_norm": 0.6005034865395634,
"learning_rate": 9.994045128633221e-06,
"loss": 0.2148,
"step": 11
},
{
"epoch": 0.03389830508474576,
"grad_norm": 0.39196864541665416,
"learning_rate": 9.99291347838381e-06,
"loss": 0.1439,
"step": 12
},
{
"epoch": 0.03672316384180791,
"grad_norm": 0.3520974148515624,
"learning_rate": 9.991683520655735e-06,
"loss": 0.138,
"step": 13
},
{
"epoch": 0.03954802259887006,
"grad_norm": 0.39891278345339304,
"learning_rate": 9.990355279666124e-06,
"loss": 0.1778,
"step": 14
},
{
"epoch": 0.0423728813559322,
"grad_norm": 0.38823463524310226,
"learning_rate": 9.988928781567251e-06,
"loss": 0.1802,
"step": 15
},
{
"epoch": 0.04519774011299435,
"grad_norm": 0.3456396477919156,
"learning_rate": 9.987404054446009e-06,
"loss": 0.1537,
"step": 16
},
{
"epoch": 0.0480225988700565,
"grad_norm": 0.4433642134268957,
"learning_rate": 9.98578112832336e-06,
"loss": 0.2148,
"step": 17
},
{
"epoch": 0.05084745762711865,
"grad_norm": 0.5978046488506871,
"learning_rate": 9.984060035153752e-06,
"loss": 0.1769,
"step": 18
},
{
"epoch": 0.05367231638418079,
"grad_norm": 0.3537901451183181,
"learning_rate": 9.982240808824477e-06,
"loss": 0.1323,
"step": 19
},
{
"epoch": 0.05649717514124294,
"grad_norm": 0.5417108653864326,
"learning_rate": 9.980323485155013e-06,
"loss": 0.1666,
"step": 20
},
{
"epoch": 0.059322033898305086,
"grad_norm": 0.35289882003600165,
"learning_rate": 9.978308101896318e-06,
"loss": 0.1205,
"step": 21
},
{
"epoch": 0.062146892655367235,
"grad_norm": 0.36004933523243315,
"learning_rate": 9.97619469873008e-06,
"loss": 0.1371,
"step": 22
},
{
"epoch": 0.06497175141242938,
"grad_norm": 0.3854571301167785,
"learning_rate": 9.973983317267944e-06,
"loss": 0.1568,
"step": 23
},
{
"epoch": 0.06779661016949153,
"grad_norm": 0.6487420228246645,
"learning_rate": 9.971674001050687e-06,
"loss": 0.1647,
"step": 24
},
{
"epoch": 0.07062146892655367,
"grad_norm": 0.3854893336014017,
"learning_rate": 9.969266795547364e-06,
"loss": 0.1854,
"step": 25
},
{
"epoch": 0.07344632768361582,
"grad_norm": 0.3832699272168604,
"learning_rate": 9.96676174815441e-06,
"loss": 0.1955,
"step": 26
},
{
"epoch": 0.07627118644067797,
"grad_norm": 0.31760668668906533,
"learning_rate": 9.964158908194708e-06,
"loss": 0.1195,
"step": 27
},
{
"epoch": 0.07909604519774012,
"grad_norm": 0.41114801745145746,
"learning_rate": 9.961458326916624e-06,
"loss": 0.1414,
"step": 28
},
{
"epoch": 0.08192090395480225,
"grad_norm": 0.3240630544735708,
"learning_rate": 9.958660057492982e-06,
"loss": 0.1562,
"step": 29
},
{
"epoch": 0.0847457627118644,
"grad_norm": 0.3317790911132787,
"learning_rate": 9.955764155020037e-06,
"loss": 0.1263,
"step": 30
},
{
"epoch": 0.08757062146892655,
"grad_norm": 0.3730810477019189,
"learning_rate": 9.952770676516372e-06,
"loss": 0.1549,
"step": 31
},
{
"epoch": 0.0903954802259887,
"grad_norm": 0.3594977608871381,
"learning_rate": 9.94967968092179e-06,
"loss": 0.1362,
"step": 32
},
{
"epoch": 0.09322033898305085,
"grad_norm": 0.5395386567672124,
"learning_rate": 9.946491229096143e-06,
"loss": 0.1447,
"step": 33
},
{
"epoch": 0.096045197740113,
"grad_norm": 0.34069006187102624,
"learning_rate": 9.943205383818142e-06,
"loss": 0.1602,
"step": 34
},
{
"epoch": 0.09887005649717515,
"grad_norm": 0.40507986945355756,
"learning_rate": 9.93982220978411e-06,
"loss": 0.1727,
"step": 35
},
{
"epoch": 0.1016949152542373,
"grad_norm": 0.3101237427572441,
"learning_rate": 9.936341773606723e-06,
"loss": 0.1328,
"step": 36
},
{
"epoch": 0.10451977401129943,
"grad_norm": 0.32711902171143986,
"learning_rate": 9.932764143813686e-06,
"loss": 0.1377,
"step": 37
},
{
"epoch": 0.10734463276836158,
"grad_norm": 0.3698266114267764,
"learning_rate": 9.929089390846389e-06,
"loss": 0.1593,
"step": 38
},
{
"epoch": 0.11016949152542373,
"grad_norm": 0.36372051261110894,
"learning_rate": 9.925317587058516e-06,
"loss": 0.1238,
"step": 39
},
{
"epoch": 0.11299435028248588,
"grad_norm": 0.36933238520645995,
"learning_rate": 9.92144880671463e-06,
"loss": 0.1678,
"step": 40
},
{
"epoch": 0.11581920903954802,
"grad_norm": 0.3460461190525915,
"learning_rate": 9.9174831259887e-06,
"loss": 0.1578,
"step": 41
},
{
"epoch": 0.11864406779661017,
"grad_norm": 0.5499266672914876,
"learning_rate": 9.913420622962606e-06,
"loss": 0.1437,
"step": 42
},
{
"epoch": 0.12146892655367232,
"grad_norm": 0.4049849988270768,
"learning_rate": 9.909261377624601e-06,
"loss": 0.187,
"step": 43
},
{
"epoch": 0.12429378531073447,
"grad_norm": 0.47235530071518683,
"learning_rate": 9.90500547186774e-06,
"loss": 0.1449,
"step": 44
},
{
"epoch": 0.1271186440677966,
"grad_norm": 0.36137299735697387,
"learning_rate": 9.900652989488255e-06,
"loss": 0.1505,
"step": 45
},
{
"epoch": 0.12994350282485875,
"grad_norm": 0.372928772406179,
"learning_rate": 9.896204016183924e-06,
"loss": 0.1984,
"step": 46
},
{
"epoch": 0.1327683615819209,
"grad_norm": 0.31608553050802646,
"learning_rate": 9.891658639552368e-06,
"loss": 0.1233,
"step": 47
},
{
"epoch": 0.13559322033898305,
"grad_norm": 0.34009496825621116,
"learning_rate": 9.887016949089334e-06,
"loss": 0.1533,
"step": 48
},
{
"epoch": 0.1384180790960452,
"grad_norm": 0.3475244874058269,
"learning_rate": 9.882279036186927e-06,
"loss": 0.1348,
"step": 49
},
{
"epoch": 0.14124293785310735,
"grad_norm": 0.3465923771954804,
"learning_rate": 9.87744499413182e-06,
"loss": 0.1211,
"step": 50
},
{
"epoch": 0.1440677966101695,
"grad_norm": 0.4625874588181501,
"learning_rate": 9.872514918103407e-06,
"loss": 0.1316,
"step": 51
},
{
"epoch": 0.14689265536723164,
"grad_norm": 0.4219123981937324,
"learning_rate": 9.867488905171934e-06,
"loss": 0.1499,
"step": 52
},
{
"epoch": 0.1497175141242938,
"grad_norm": 0.28327838865809335,
"learning_rate": 9.86236705429659e-06,
"loss": 0.118,
"step": 53
},
{
"epoch": 0.15254237288135594,
"grad_norm": 0.42996698854118004,
"learning_rate": 9.85714946632355e-06,
"loss": 0.1445,
"step": 54
},
{
"epoch": 0.1553672316384181,
"grad_norm": 0.3158297884352036,
"learning_rate": 9.851836243984005e-06,
"loss": 0.1026,
"step": 55
},
{
"epoch": 0.15819209039548024,
"grad_norm": 0.34425358123925115,
"learning_rate": 9.846427491892117e-06,
"loss": 0.1296,
"step": 56
},
{
"epoch": 0.16101694915254236,
"grad_norm": 0.26454441870053136,
"learning_rate": 9.840923316542984e-06,
"loss": 0.0945,
"step": 57
},
{
"epoch": 0.1638418079096045,
"grad_norm": 0.47330273293079517,
"learning_rate": 9.835323826310522e-06,
"loss": 0.1645,
"step": 58
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.30609709369908644,
"learning_rate": 9.829629131445342e-06,
"loss": 0.122,
"step": 59
},
{
"epoch": 0.1694915254237288,
"grad_norm": 0.4330091411356326,
"learning_rate": 9.823839344072582e-06,
"loss": 0.1494,
"step": 60
},
{
"epoch": 0.17231638418079095,
"grad_norm": 0.42693180952210663,
"learning_rate": 9.817954578189686e-06,
"loss": 0.1235,
"step": 61
},
{
"epoch": 0.1751412429378531,
"grad_norm": 0.3324864445850298,
"learning_rate": 9.811974949664176e-06,
"loss": 0.1327,
"step": 62
},
{
"epoch": 0.17796610169491525,
"grad_norm": 0.41054779677427783,
"learning_rate": 9.805900576231358e-06,
"loss": 0.1741,
"step": 63
},
{
"epoch": 0.1807909604519774,
"grad_norm": 0.33830483714924636,
"learning_rate": 9.79973157749201e-06,
"loss": 0.1265,
"step": 64
},
{
"epoch": 0.18361581920903955,
"grad_norm": 0.3152187519500513,
"learning_rate": 9.793468074910028e-06,
"loss": 0.1202,
"step": 65
},
{
"epoch": 0.1864406779661017,
"grad_norm": 0.368592840409881,
"learning_rate": 9.787110191810027e-06,
"loss": 0.1292,
"step": 66
},
{
"epoch": 0.18926553672316385,
"grad_norm": 0.3471829002234472,
"learning_rate": 9.780658053374923e-06,
"loss": 0.1532,
"step": 67
},
{
"epoch": 0.192090395480226,
"grad_norm": 0.3461593726912688,
"learning_rate": 9.77411178664346e-06,
"loss": 0.1352,
"step": 68
},
{
"epoch": 0.19491525423728814,
"grad_norm": 0.647720230993773,
"learning_rate": 9.767471520507713e-06,
"loss": 0.1291,
"step": 69
},
{
"epoch": 0.1977401129943503,
"grad_norm": 0.31371308918917606,
"learning_rate": 9.760737385710546e-06,
"loss": 0.1363,
"step": 70
},
{
"epoch": 0.20056497175141244,
"grad_norm": 0.36616052339629535,
"learning_rate": 9.753909514843047e-06,
"loss": 0.163,
"step": 71
},
{
"epoch": 0.2033898305084746,
"grad_norm": 0.3366775524198522,
"learning_rate": 9.746988042341907e-06,
"loss": 0.1211,
"step": 72
},
{
"epoch": 0.2062146892655367,
"grad_norm": 0.32238872392227863,
"learning_rate": 9.739973104486777e-06,
"loss": 0.1244,
"step": 73
},
{
"epoch": 0.20903954802259886,
"grad_norm": 0.309699834626684,
"learning_rate": 9.732864839397585e-06,
"loss": 0.1123,
"step": 74
},
{
"epoch": 0.211864406779661,
"grad_norm": 0.40415497402853257,
"learning_rate": 9.725663387031818e-06,
"loss": 0.1185,
"step": 75
},
{
"epoch": 0.21468926553672316,
"grad_norm": 0.43571692379160615,
"learning_rate": 9.718368889181763e-06,
"loss": 0.1205,
"step": 76
},
{
"epoch": 0.2175141242937853,
"grad_norm": 0.3220023585216405,
"learning_rate": 9.710981489471721e-06,
"loss": 0.1513,
"step": 77
},
{
"epoch": 0.22033898305084745,
"grad_norm": 0.41630953777961766,
"learning_rate": 9.703501333355167e-06,
"loss": 0.1249,
"step": 78
},
{
"epoch": 0.2231638418079096,
"grad_norm": 0.5019640716602216,
"learning_rate": 9.6959285681119e-06,
"loss": 0.1914,
"step": 79
},
{
"epoch": 0.22598870056497175,
"grad_norm": 0.4023458378359013,
"learning_rate": 9.68826334284514e-06,
"loss": 0.1346,
"step": 80
},
{
"epoch": 0.2288135593220339,
"grad_norm": 0.34337841648791245,
"learning_rate": 9.680505808478583e-06,
"loss": 0.1272,
"step": 81
},
{
"epoch": 0.23163841807909605,
"grad_norm": 0.32382210914063797,
"learning_rate": 9.672656117753435e-06,
"loss": 0.1155,
"step": 82
},
{
"epoch": 0.2344632768361582,
"grad_norm": 0.4098493186152155,
"learning_rate": 9.664714425225414e-06,
"loss": 0.155,
"step": 83
},
{
"epoch": 0.23728813559322035,
"grad_norm": 0.30680580576204597,
"learning_rate": 9.656680887261693e-06,
"loss": 0.1268,
"step": 84
},
{
"epoch": 0.2401129943502825,
"grad_norm": 0.34530888636266843,
"learning_rate": 9.648555662037826e-06,
"loss": 0.1255,
"step": 85
},
{
"epoch": 0.24293785310734464,
"grad_norm": 0.31982112281364816,
"learning_rate": 9.640338909534636e-06,
"loss": 0.1187,
"step": 86
},
{
"epoch": 0.2457627118644068,
"grad_norm": 0.3249369835609272,
"learning_rate": 9.632030791535063e-06,
"loss": 0.1094,
"step": 87
},
{
"epoch": 0.24858757062146894,
"grad_norm": 0.4202492399280893,
"learning_rate": 9.62363147162098e-06,
"loss": 0.1473,
"step": 88
},
{
"epoch": 0.2514124293785311,
"grad_norm": 0.4206786168286477,
"learning_rate": 9.615141115169968e-06,
"loss": 0.1321,
"step": 89
},
{
"epoch": 0.2542372881355932,
"grad_norm": 0.35455170569024247,
"learning_rate": 9.606559889352065e-06,
"loss": 0.1333,
"step": 90
},
{
"epoch": 0.2570621468926554,
"grad_norm": 0.30654274314933255,
"learning_rate": 9.597887963126476e-06,
"loss": 0.1324,
"step": 91
},
{
"epoch": 0.2598870056497175,
"grad_norm": 0.31426903973238157,
"learning_rate": 9.589125507238234e-06,
"loss": 0.0965,
"step": 92
},
{
"epoch": 0.2627118644067797,
"grad_norm": 0.39888445390797206,
"learning_rate": 9.580272694214855e-06,
"loss": 0.1795,
"step": 93
},
{
"epoch": 0.2655367231638418,
"grad_norm": 0.32688334776107986,
"learning_rate": 9.571329698362931e-06,
"loss": 0.0993,
"step": 94
},
{
"epoch": 0.268361581920904,
"grad_norm": 0.37334590333180434,
"learning_rate": 9.562296695764695e-06,
"loss": 0.1201,
"step": 95
},
{
"epoch": 0.2711864406779661,
"grad_norm": 0.3189896682514011,
"learning_rate": 9.553173864274567e-06,
"loss": 0.1261,
"step": 96
},
{
"epoch": 0.2740112994350282,
"grad_norm": 0.34974753780587814,
"learning_rate": 9.543961383515638e-06,
"loss": 0.1476,
"step": 97
},
{
"epoch": 0.2768361581920904,
"grad_norm": 0.3593944813062625,
"learning_rate": 9.53465943487614e-06,
"loss": 0.123,
"step": 98
},
{
"epoch": 0.2796610169491525,
"grad_norm": 0.35812916144207474,
"learning_rate": 9.52526820150588e-06,
"loss": 0.1257,
"step": 99
},
{
"epoch": 0.2824858757062147,
"grad_norm": 0.32235620707593265,
"learning_rate": 9.51578786831262e-06,
"loss": 0.1493,
"step": 100
},
{
"epoch": 0.2853107344632768,
"grad_norm": 0.37142164119731536,
"learning_rate": 9.506218621958448e-06,
"loss": 0.1278,
"step": 101
},
{
"epoch": 0.288135593220339,
"grad_norm": 0.44435482682594635,
"learning_rate": 9.496560650856097e-06,
"loss": 0.1443,
"step": 102
},
{
"epoch": 0.2909604519774011,
"grad_norm": 0.3566707987782221,
"learning_rate": 9.486814145165242e-06,
"loss": 0.1122,
"step": 103
},
{
"epoch": 0.2937853107344633,
"grad_norm": 0.33349931081864886,
"learning_rate": 9.476979296788746e-06,
"loss": 0.1185,
"step": 104
},
{
"epoch": 0.2966101694915254,
"grad_norm": 0.3196942221194822,
"learning_rate": 9.467056299368888e-06,
"loss": 0.1405,
"step": 105
},
{
"epoch": 0.2994350282485876,
"grad_norm": 0.300108694500991,
"learning_rate": 9.457045348283552e-06,
"loss": 0.1032,
"step": 106
},
{
"epoch": 0.3022598870056497,
"grad_norm": 0.2628183218575623,
"learning_rate": 9.446946640642372e-06,
"loss": 0.0891,
"step": 107
},
{
"epoch": 0.3050847457627119,
"grad_norm": 0.3173943921412448,
"learning_rate": 9.436760375282858e-06,
"loss": 0.1389,
"step": 108
},
{
"epoch": 0.307909604519774,
"grad_norm": 0.3374023308082592,
"learning_rate": 9.426486752766481e-06,
"loss": 0.1433,
"step": 109
},
{
"epoch": 0.3107344632768362,
"grad_norm": 0.32296890768155634,
"learning_rate": 9.416125975374722e-06,
"loss": 0.1489,
"step": 110
},
{
"epoch": 0.3135593220338983,
"grad_norm": 0.809125754824113,
"learning_rate": 9.405678247105083e-06,
"loss": 0.1197,
"step": 111
},
{
"epoch": 0.3163841807909605,
"grad_norm": 0.2899410860114942,
"learning_rate": 9.395143773667089e-06,
"loss": 0.1101,
"step": 112
},
{
"epoch": 0.3192090395480226,
"grad_norm": 0.35261776609510237,
"learning_rate": 9.38452276247821e-06,
"loss": 0.15,
"step": 113
},
{
"epoch": 0.3220338983050847,
"grad_norm": 0.40136031754220597,
"learning_rate": 9.373815422659806e-06,
"loss": 0.1822,
"step": 114
},
{
"epoch": 0.3248587570621469,
"grad_norm": 0.313017574757418,
"learning_rate": 9.363021965032993e-06,
"loss": 0.1188,
"step": 115
},
{
"epoch": 0.327683615819209,
"grad_norm": 0.373452387713479,
"learning_rate": 9.352142602114487e-06,
"loss": 0.137,
"step": 116
},
{
"epoch": 0.3305084745762712,
"grad_norm": 0.3420346614883704,
"learning_rate": 9.341177548112437e-06,
"loss": 0.1344,
"step": 117
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.4036780338239374,
"learning_rate": 9.330127018922195e-06,
"loss": 0.133,
"step": 118
},
{
"epoch": 0.3361581920903955,
"grad_norm": 0.32756640437501205,
"learning_rate": 9.318991232122065e-06,
"loss": 0.1321,
"step": 119
},
{
"epoch": 0.3389830508474576,
"grad_norm": 0.345171287549607,
"learning_rate": 9.307770406969032e-06,
"loss": 0.1202,
"step": 120
},
{
"epoch": 0.3418079096045198,
"grad_norm": 0.437595297190657,
"learning_rate": 9.296464764394422e-06,
"loss": 0.1824,
"step": 121
},
{
"epoch": 0.3446327683615819,
"grad_norm": 0.2838090630299304,
"learning_rate": 9.285074526999577e-06,
"loss": 0.114,
"step": 122
},
{
"epoch": 0.3474576271186441,
"grad_norm": 0.33725871702683274,
"learning_rate": 9.273599919051452e-06,
"loss": 0.1254,
"step": 123
},
{
"epoch": 0.3502824858757062,
"grad_norm": 0.2980485886022378,
"learning_rate": 9.262041166478215e-06,
"loss": 0.1139,
"step": 124
},
{
"epoch": 0.3531073446327684,
"grad_norm": 0.35961892985059185,
"learning_rate": 9.250398496864782e-06,
"loss": 0.114,
"step": 125
},
{
"epoch": 0.3559322033898305,
"grad_norm": 0.44215669470603974,
"learning_rate": 9.238672139448354e-06,
"loss": 0.1133,
"step": 126
},
{
"epoch": 0.3587570621468927,
"grad_norm": 0.4377017180308674,
"learning_rate": 9.226862325113894e-06,
"loss": 0.1438,
"step": 127
},
{
"epoch": 0.3615819209039548,
"grad_norm": 0.3248130129102471,
"learning_rate": 9.214969286389577e-06,
"loss": 0.1079,
"step": 128
},
{
"epoch": 0.3644067796610169,
"grad_norm": 0.3687271690510821,
"learning_rate": 9.202993257442216e-06,
"loss": 0.1576,
"step": 129
},
{
"epoch": 0.3672316384180791,
"grad_norm": 0.31854432765060964,
"learning_rate": 9.190934474072658e-06,
"loss": 0.1128,
"step": 130
},
{
"epoch": 0.3700564971751412,
"grad_norm": 0.43632199490503926,
"learning_rate": 9.178793173711133e-06,
"loss": 0.16,
"step": 131
},
{
"epoch": 0.3728813559322034,
"grad_norm": 0.3184995213048756,
"learning_rate": 9.166569595412576e-06,
"loss": 0.1087,
"step": 132
},
{
"epoch": 0.3757062146892655,
"grad_norm": 0.2852410374038083,
"learning_rate": 9.154263979851932e-06,
"loss": 0.1017,
"step": 133
},
{
"epoch": 0.3785310734463277,
"grad_norm": 0.4894599992720483,
"learning_rate": 9.141876569319405e-06,
"loss": 0.1338,
"step": 134
},
{
"epoch": 0.3813559322033898,
"grad_norm": 0.3025442389978783,
"learning_rate": 9.129407607715697e-06,
"loss": 0.1029,
"step": 135
},
{
"epoch": 0.384180790960452,
"grad_norm": 0.3362331028333517,
"learning_rate": 9.116857340547203e-06,
"loss": 0.1171,
"step": 136
},
{
"epoch": 0.3870056497175141,
"grad_norm": 0.3058727798799934,
"learning_rate": 9.104226014921171e-06,
"loss": 0.1242,
"step": 137
},
{
"epoch": 0.3898305084745763,
"grad_norm": 0.32225074355114075,
"learning_rate": 9.091513879540845e-06,
"loss": 0.1199,
"step": 138
},
{
"epoch": 0.3926553672316384,
"grad_norm": 0.4160727630117183,
"learning_rate": 9.078721184700565e-06,
"loss": 0.1703,
"step": 139
},
{
"epoch": 0.3954802259887006,
"grad_norm": 0.3825137129544069,
"learning_rate": 9.065848182280835e-06,
"loss": 0.1417,
"step": 140
},
{
"epoch": 0.3983050847457627,
"grad_norm": 0.3109793025220252,
"learning_rate": 9.05289512574337e-06,
"loss": 0.1079,
"step": 141
},
{
"epoch": 0.4011299435028249,
"grad_norm": 0.32403338158384704,
"learning_rate": 9.039862270126102e-06,
"loss": 0.1304,
"step": 142
},
{
"epoch": 0.403954802259887,
"grad_norm": 0.333516403508159,
"learning_rate": 9.026749872038161e-06,
"loss": 0.1215,
"step": 143
},
{
"epoch": 0.4067796610169492,
"grad_norm": 0.5512785845644815,
"learning_rate": 9.013558189654819e-06,
"loss": 0.119,
"step": 144
},
{
"epoch": 0.4096045197740113,
"grad_norm": 0.3342692670079559,
"learning_rate": 9.000287482712407e-06,
"loss": 0.1327,
"step": 145
},
{
"epoch": 0.4124293785310734,
"grad_norm": 0.3844315934040279,
"learning_rate": 8.986938012503203e-06,
"loss": 0.1354,
"step": 146
},
{
"epoch": 0.4152542372881356,
"grad_norm": 0.3704715200009876,
"learning_rate": 8.973510041870287e-06,
"loss": 0.116,
"step": 147
},
{
"epoch": 0.4180790960451977,
"grad_norm": 0.3731805906895666,
"learning_rate": 8.960003835202369e-06,
"loss": 0.1447,
"step": 148
},
{
"epoch": 0.4209039548022599,
"grad_norm": 0.3074985687330806,
"learning_rate": 8.946419658428573e-06,
"loss": 0.0944,
"step": 149
},
{
"epoch": 0.423728813559322,
"grad_norm": 0.43341288159333946,
"learning_rate": 8.932757779013214e-06,
"loss": 0.1266,
"step": 150
},
{
"epoch": 0.4265536723163842,
"grad_norm": 0.336284725389875,
"learning_rate": 8.919018465950517e-06,
"loss": 0.1398,
"step": 151
},
{
"epoch": 0.4293785310734463,
"grad_norm": 0.35240140744565995,
"learning_rate": 8.90520198975934e-06,
"loss": 0.161,
"step": 152
},
{
"epoch": 0.4322033898305085,
"grad_norm": 0.33406617309219283,
"learning_rate": 8.89130862247783e-06,
"loss": 0.1452,
"step": 153
},
{
"epoch": 0.4350282485875706,
"grad_norm": 0.4037611933501478,
"learning_rate": 8.877338637658074e-06,
"loss": 0.1355,
"step": 154
},
{
"epoch": 0.4378531073446328,
"grad_norm": 0.3475789016676025,
"learning_rate": 8.863292310360716e-06,
"loss": 0.1298,
"step": 155
},
{
"epoch": 0.4406779661016949,
"grad_norm": 0.4841263150458153,
"learning_rate": 8.849169917149532e-06,
"loss": 0.1207,
"step": 156
},
{
"epoch": 0.4435028248587571,
"grad_norm": 0.30177409440633907,
"learning_rate": 8.834971736085995e-06,
"loss": 0.1092,
"step": 157
},
{
"epoch": 0.4463276836158192,
"grad_norm": 0.3405364099196524,
"learning_rate": 8.820698046723796e-06,
"loss": 0.1196,
"step": 158
},
{
"epoch": 0.4491525423728814,
"grad_norm": 0.3399165082210464,
"learning_rate": 8.806349130103334e-06,
"loss": 0.1215,
"step": 159
},
{
"epoch": 0.4519774011299435,
"grad_norm": 0.30309252729845493,
"learning_rate": 8.791925268746193e-06,
"loss": 0.133,
"step": 160
},
{
"epoch": 0.4548022598870056,
"grad_norm": 0.4127573259391668,
"learning_rate": 8.777426746649571e-06,
"loss": 0.1131,
"step": 161
},
{
"epoch": 0.4576271186440678,
"grad_norm": 0.3210525339974507,
"learning_rate": 8.762853849280692e-06,
"loss": 0.1097,
"step": 162
},
{
"epoch": 0.4604519774011299,
"grad_norm": 0.38978996569997443,
"learning_rate": 8.748206863571188e-06,
"loss": 0.1259,
"step": 163
},
{
"epoch": 0.4632768361581921,
"grad_norm": 0.3200510621145131,
"learning_rate": 8.73348607791144e-06,
"loss": 0.1028,
"step": 164
},
{
"epoch": 0.4661016949152542,
"grad_norm": 0.5781086641593365,
"learning_rate": 8.718691782144908e-06,
"loss": 0.1504,
"step": 165
},
{
"epoch": 0.4689265536723164,
"grad_norm": 0.3161434045596249,
"learning_rate": 8.703824267562424e-06,
"loss": 0.1341,
"step": 166
},
{
"epoch": 0.4717514124293785,
"grad_norm": 0.2920424513956196,
"learning_rate": 8.688883826896458e-06,
"loss": 0.0985,
"step": 167
},
{
"epoch": 0.4745762711864407,
"grad_norm": 0.3862783048815386,
"learning_rate": 8.673870754315336e-06,
"loss": 0.1352,
"step": 168
},
{
"epoch": 0.4774011299435028,
"grad_norm": 0.4426996657474187,
"learning_rate": 8.658785345417484e-06,
"loss": 0.1414,
"step": 169
},
{
"epoch": 0.480225988700565,
"grad_norm": 0.461223196307408,
"learning_rate": 8.64362789722557e-06,
"loss": 0.1397,
"step": 170
},
{
"epoch": 0.4830508474576271,
"grad_norm": 0.3236362093117533,
"learning_rate": 8.62839870818068e-06,
"loss": 0.1185,
"step": 171
},
{
"epoch": 0.4858757062146893,
"grad_norm": 0.5017393956700863,
"learning_rate": 8.613098078136436e-06,
"loss": 0.1301,
"step": 172
},
{
"epoch": 0.4887005649717514,
"grad_norm": 0.3476632540894821,
"learning_rate": 8.597726308353085e-06,
"loss": 0.1265,
"step": 173
},
{
"epoch": 0.4915254237288136,
"grad_norm": 0.38990017498501656,
"learning_rate": 8.582283701491576e-06,
"loss": 0.152,
"step": 174
},
{
"epoch": 0.4943502824858757,
"grad_norm": 0.3190826675735708,
"learning_rate": 8.566770561607598e-06,
"loss": 0.1281,
"step": 175
},
{
"epoch": 0.4971751412429379,
"grad_norm": 0.3967141406957403,
"learning_rate": 8.551187194145591e-06,
"loss": 0.1546,
"step": 176
},
{
"epoch": 0.5,
"grad_norm": 0.3328836982483565,
"learning_rate": 8.535533905932739e-06,
"loss": 0.1314,
"step": 177
},
{
"epoch": 0.5028248587570622,
"grad_norm": 0.374457685382645,
"learning_rate": 8.519811005172916e-06,
"loss": 0.1165,
"step": 178
},
{
"epoch": 0.5056497175141242,
"grad_norm": 0.363385614117873,
"learning_rate": 8.50401880144063e-06,
"loss": 0.1226,
"step": 179
},
{
"epoch": 0.5084745762711864,
"grad_norm": 0.32620237927339973,
"learning_rate": 8.488157605674924e-06,
"loss": 0.1203,
"step": 180
},
{
"epoch": 0.5112994350282486,
"grad_norm": 0.2961670016082417,
"learning_rate": 8.472227730173252e-06,
"loss": 0.104,
"step": 181
},
{
"epoch": 0.5141242937853108,
"grad_norm": 0.3350355060825277,
"learning_rate": 8.456229488585328e-06,
"loss": 0.1136,
"step": 182
},
{
"epoch": 0.5169491525423728,
"grad_norm": 0.3540976396926151,
"learning_rate": 8.440163195906959e-06,
"loss": 0.0946,
"step": 183
},
{
"epoch": 0.519774011299435,
"grad_norm": 0.33467249667837795,
"learning_rate": 8.424029168473829e-06,
"loss": 0.1341,
"step": 184
},
{
"epoch": 0.5225988700564972,
"grad_norm": 0.3582483372686221,
"learning_rate": 8.407827723955287e-06,
"loss": 0.101,
"step": 185
},
{
"epoch": 0.5254237288135594,
"grad_norm": 0.3871399637156461,
"learning_rate": 8.391559181348081e-06,
"loss": 0.1368,
"step": 186
},
{
"epoch": 0.5282485875706214,
"grad_norm": 0.44494561948427913,
"learning_rate": 8.375223860970078e-06,
"loss": 0.1542,
"step": 187
},
{
"epoch": 0.5310734463276836,
"grad_norm": 0.342778980721638,
"learning_rate": 8.358822084453964e-06,
"loss": 0.1343,
"step": 188
},
{
"epoch": 0.5338983050847458,
"grad_norm": 0.3190954383119015,
"learning_rate": 8.342354174740904e-06,
"loss": 0.1217,
"step": 189
},
{
"epoch": 0.536723163841808,
"grad_norm": 0.32918038764737034,
"learning_rate": 8.325820456074181e-06,
"loss": 0.1158,
"step": 190
},
{
"epoch": 0.53954802259887,
"grad_norm": 0.3879838615458632,
"learning_rate": 8.309221253992825e-06,
"loss": 0.1256,
"step": 191
},
{
"epoch": 0.5423728813559322,
"grad_norm": 0.35618247459755487,
"learning_rate": 8.292556895325195e-06,
"loss": 0.1099,
"step": 192
},
{
"epoch": 0.5451977401129944,
"grad_norm": 0.3234827304096444,
"learning_rate": 8.275827708182536e-06,
"loss": 0.1284,
"step": 193
},
{
"epoch": 0.5480225988700564,
"grad_norm": 0.34766152172703946,
"learning_rate": 8.259034021952537e-06,
"loss": 0.1356,
"step": 194
},
{
"epoch": 0.5508474576271186,
"grad_norm": 0.36732776065701966,
"learning_rate": 8.242176167292827e-06,
"loss": 0.1151,
"step": 195
},
{
"epoch": 0.5536723163841808,
"grad_norm": 0.33889281731020826,
"learning_rate": 8.225254476124479e-06,
"loss": 0.1063,
"step": 196
},
{
"epoch": 0.556497175141243,
"grad_norm": 0.3193880715488108,
"learning_rate": 8.208269281625466e-06,
"loss": 0.1276,
"step": 197
},
{
"epoch": 0.559322033898305,
"grad_norm": 0.3204272104884364,
"learning_rate": 8.191220918224102e-06,
"loss": 0.1223,
"step": 198
},
{
"epoch": 0.5621468926553672,
"grad_norm": 0.2837424255636162,
"learning_rate": 8.174109721592463e-06,
"loss": 0.1004,
"step": 199
},
{
"epoch": 0.5649717514124294,
"grad_norm": 0.37977501376826955,
"learning_rate": 8.156936028639768e-06,
"loss": 0.1343,
"step": 200
},
{
"epoch": 0.5677966101694916,
"grad_norm": 0.3802933186191023,
"learning_rate": 8.13970017750576e-06,
"loss": 0.1229,
"step": 201
},
{
"epoch": 0.5706214689265536,
"grad_norm": 0.3336231226315229,
"learning_rate": 8.12240250755403e-06,
"loss": 0.1071,
"step": 202
},
{
"epoch": 0.5734463276836158,
"grad_norm": 0.31611617409721354,
"learning_rate": 8.10504335936535e-06,
"loss": 0.1315,
"step": 203
},
{
"epoch": 0.576271186440678,
"grad_norm": 0.37986304879280974,
"learning_rate": 8.08762307473096e-06,
"loss": 0.1488,
"step": 204
},
{
"epoch": 0.5790960451977402,
"grad_norm": 0.47382310798593613,
"learning_rate": 8.07014199664584e-06,
"loss": 0.1199,
"step": 205
},
{
"epoch": 0.5819209039548022,
"grad_norm": 0.328423980785905,
"learning_rate": 8.052600469301958e-06,
"loss": 0.1094,
"step": 206
},
{
"epoch": 0.5847457627118644,
"grad_norm": 0.36866153232988097,
"learning_rate": 8.03499883808149e-06,
"loss": 0.1215,
"step": 207
},
{
"epoch": 0.5875706214689266,
"grad_norm": 0.4258404561855114,
"learning_rate": 8.01733744955002e-06,
"loss": 0.1728,
"step": 208
},
{
"epoch": 0.5903954802259888,
"grad_norm": 0.4379066754162319,
"learning_rate": 7.999616651449722e-06,
"loss": 0.1334,
"step": 209
},
{
"epoch": 0.5932203389830508,
"grad_norm": 0.3298246566080818,
"learning_rate": 7.981836792692508e-06,
"loss": 0.1321,
"step": 210
},
{
"epoch": 0.596045197740113,
"grad_norm": 0.3213552929669426,
"learning_rate": 7.963998223353154e-06,
"loss": 0.1475,
"step": 211
},
{
"epoch": 0.5988700564971752,
"grad_norm": 0.6800071694421395,
"learning_rate": 7.946101294662418e-06,
"loss": 0.1521,
"step": 212
},
{
"epoch": 0.6016949152542372,
"grad_norm": 0.31394438576238753,
"learning_rate": 7.928146359000117e-06,
"loss": 0.1269,
"step": 213
},
{
"epoch": 0.6045197740112994,
"grad_norm": 0.34115929397266076,
"learning_rate": 7.91013376988819e-06,
"loss": 0.1079,
"step": 214
},
{
"epoch": 0.6073446327683616,
"grad_norm": 0.3070888783690557,
"learning_rate": 7.892063881983736e-06,
"loss": 0.1037,
"step": 215
},
{
"epoch": 0.6101694915254238,
"grad_norm": 0.35233334420808005,
"learning_rate": 7.873937051072037e-06,
"loss": 0.1291,
"step": 216
},
{
"epoch": 0.6129943502824858,
"grad_norm": 0.37926219175637094,
"learning_rate": 7.855753634059543e-06,
"loss": 0.1997,
"step": 217
},
{
"epoch": 0.615819209039548,
"grad_norm": 0.281725523224709,
"learning_rate": 7.83751398896686e-06,
"loss": 0.0946,
"step": 218
},
{
"epoch": 0.6186440677966102,
"grad_norm": 0.33366851056170144,
"learning_rate": 7.81921847492168e-06,
"loss": 0.1355,
"step": 219
},
{
"epoch": 0.6214689265536724,
"grad_norm": 0.4693528685955929,
"learning_rate": 7.80086745215173e-06,
"loss": 0.1235,
"step": 220
},
{
"epoch": 0.6242937853107344,
"grad_norm": 0.348566291026797,
"learning_rate": 7.782461281977668e-06,
"loss": 0.1463,
"step": 221
},
{
"epoch": 0.6271186440677966,
"grad_norm": 0.3432846241066091,
"learning_rate": 7.764000326805967e-06,
"loss": 0.1253,
"step": 222
},
{
"epoch": 0.6299435028248588,
"grad_norm": 0.3907373096790576,
"learning_rate": 7.74548495012179e-06,
"loss": 0.1391,
"step": 223
},
{
"epoch": 0.632768361581921,
"grad_norm": 0.3728412944790509,
"learning_rate": 7.726915516481824e-06,
"loss": 0.122,
"step": 224
},
{
"epoch": 0.635593220338983,
"grad_norm": 0.5220133391248876,
"learning_rate": 7.708292391507105e-06,
"loss": 0.154,
"step": 225
},
{
"epoch": 0.6384180790960452,
"grad_norm": 0.4146492597116334,
"learning_rate": 7.68961594187582e-06,
"loss": 0.1359,
"step": 226
},
{
"epoch": 0.6412429378531074,
"grad_norm": 0.31397589485063837,
"learning_rate": 7.670886535316086e-06,
"loss": 0.1219,
"step": 227
},
{
"epoch": 0.6440677966101694,
"grad_norm": 0.3176441428020906,
"learning_rate": 7.652104540598712e-06,
"loss": 0.1178,
"step": 228
},
{
"epoch": 0.6468926553672316,
"grad_norm": 0.4044788848890745,
"learning_rate": 7.633270327529936e-06,
"loss": 0.0976,
"step": 229
},
{
"epoch": 0.6497175141242938,
"grad_norm": 0.4182319443410884,
"learning_rate": 7.614384266944139e-06,
"loss": 0.1645,
"step": 230
},
{
"epoch": 0.652542372881356,
"grad_norm": 0.456563352937022,
"learning_rate": 7.595446730696554e-06,
"loss": 0.1382,
"step": 231
},
{
"epoch": 0.655367231638418,
"grad_norm": 0.32903524423468056,
"learning_rate": 7.5764580916559405e-06,
"loss": 0.1326,
"step": 232
},
{
"epoch": 0.6581920903954802,
"grad_norm": 0.28365921307195663,
"learning_rate": 7.5574187236972344e-06,
"loss": 0.0935,
"step": 233
},
{
"epoch": 0.6610169491525424,
"grad_norm": 0.33982089630336487,
"learning_rate": 7.5383290016942e-06,
"loss": 0.1454,
"step": 234
},
{
"epoch": 0.6638418079096046,
"grad_norm": 0.3772064426329603,
"learning_rate": 7.519189301512042e-06,
"loss": 0.1138,
"step": 235
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.32618022313934375,
"learning_rate": 7.500000000000001e-06,
"loss": 0.1334,
"step": 236
},
{
"epoch": 0.6694915254237288,
"grad_norm": 0.45333599138089803,
"learning_rate": 7.480761474983943e-06,
"loss": 0.1124,
"step": 237
},
{
"epoch": 0.672316384180791,
"grad_norm": 0.2925984327006227,
"learning_rate": 7.461474105258911e-06,
"loss": 0.1186,
"step": 238
},
{
"epoch": 0.6751412429378532,
"grad_norm": 0.29274180541284806,
"learning_rate": 7.442138270581676e-06,
"loss": 0.1152,
"step": 239
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.31809376762962216,
"learning_rate": 7.422754351663252e-06,
"loss": 0.1305,
"step": 240
},
{
"epoch": 0.6807909604519774,
"grad_norm": 0.38676571872596516,
"learning_rate": 7.403322730161402e-06,
"loss": 0.1282,
"step": 241
},
{
"epoch": 0.6836158192090396,
"grad_norm": 0.4018879090622138,
"learning_rate": 7.3838437886731264e-06,
"loss": 0.1183,
"step": 242
},
{
"epoch": 0.6864406779661016,
"grad_norm": 0.3776316331740439,
"learning_rate": 7.364317910727128e-06,
"loss": 0.1222,
"step": 243
},
{
"epoch": 0.6892655367231638,
"grad_norm": 0.3824776982755315,
"learning_rate": 7.3447454807762565e-06,
"loss": 0.1428,
"step": 244
},
{
"epoch": 0.692090395480226,
"grad_norm": 0.3047645047687849,
"learning_rate": 7.325126884189948e-06,
"loss": 0.1385,
"step": 245
},
{
"epoch": 0.6949152542372882,
"grad_norm": 0.3314073663562561,
"learning_rate": 7.30546250724663e-06,
"loss": 0.1206,
"step": 246
},
{
"epoch": 0.6977401129943502,
"grad_norm": 0.4400078921981207,
"learning_rate": 7.285752737126117e-06,
"loss": 0.1327,
"step": 247
},
{
"epoch": 0.7005649717514124,
"grad_norm": 0.4934623443358996,
"learning_rate": 7.265997961901987e-06,
"loss": 0.1564,
"step": 248
},
{
"epoch": 0.7033898305084746,
"grad_norm": 0.30484309715193525,
"learning_rate": 7.246198570533944e-06,
"loss": 0.1242,
"step": 249
},
{
"epoch": 0.7062146892655368,
"grad_norm": 0.29354477145041785,
"learning_rate": 7.226354952860157e-06,
"loss": 0.1149,
"step": 250
},
{
"epoch": 0.7090395480225988,
"grad_norm": 0.3505364073788135,
"learning_rate": 7.206467499589584e-06,
"loss": 0.1087,
"step": 251
},
{
"epoch": 0.711864406779661,
"grad_norm": 0.29464691426886963,
"learning_rate": 7.186536602294278e-06,
"loss": 0.1142,
"step": 252
},
{
"epoch": 0.7146892655367232,
"grad_norm": 0.39810478648654263,
"learning_rate": 7.166562653401681e-06,
"loss": 0.1723,
"step": 253
},
{
"epoch": 0.7175141242937854,
"grad_norm": 0.6996654555301565,
"learning_rate": 7.146546046186893e-06,
"loss": 0.1509,
"step": 254
},
{
"epoch": 0.7203389830508474,
"grad_norm": 0.43941978897146655,
"learning_rate": 7.126487174764936e-06,
"loss": 0.1214,
"step": 255
},
{
"epoch": 0.7231638418079096,
"grad_norm": 0.8711552264444874,
"learning_rate": 7.106386434082979e-06,
"loss": 0.1814,
"step": 256
},
{
"epoch": 0.7259887005649718,
"grad_norm": 0.44400959950712626,
"learning_rate": 7.0862442199125836e-06,
"loss": 0.1358,
"step": 257
},
{
"epoch": 0.7288135593220338,
"grad_norm": 0.35970291529551507,
"learning_rate": 7.066060928841891e-06,
"loss": 0.168,
"step": 258
},
{
"epoch": 0.731638418079096,
"grad_norm": 0.3562466168617285,
"learning_rate": 7.0458369582678276e-06,
"loss": 0.1436,
"step": 259
},
{
"epoch": 0.7344632768361582,
"grad_norm": 0.3120380022750767,
"learning_rate": 7.025572706388268e-06,
"loss": 0.1146,
"step": 260
},
{
"epoch": 0.7372881355932204,
"grad_norm": 0.3774265959308726,
"learning_rate": 7.005268572194208e-06,
"loss": 0.1034,
"step": 261
},
{
"epoch": 0.7401129943502824,
"grad_norm": 0.2979654585888925,
"learning_rate": 6.984924955461901e-06,
"loss": 0.1314,
"step": 262
},
{
"epoch": 0.7429378531073446,
"grad_norm": 0.581337456503481,
"learning_rate": 6.964542256744986e-06,
"loss": 0.1417,
"step": 263
},
{
"epoch": 0.7457627118644068,
"grad_norm": 0.38453399260208176,
"learning_rate": 6.944120877366605e-06,
"loss": 0.1564,
"step": 264
},
{
"epoch": 0.748587570621469,
"grad_norm": 0.4514847039771076,
"learning_rate": 6.923661219411494e-06,
"loss": 0.1104,
"step": 265
},
{
"epoch": 0.751412429378531,
"grad_norm": 0.32941464728785497,
"learning_rate": 6.9031636857180795e-06,
"loss": 0.1232,
"step": 266
},
{
"epoch": 0.7542372881355932,
"grad_norm": 0.32342200359362855,
"learning_rate": 6.8826286798705325e-06,
"loss": 0.1298,
"step": 267
},
{
"epoch": 0.7570621468926554,
"grad_norm": 0.31600309169763335,
"learning_rate": 6.86205660619083e-06,
"loss": 0.1052,
"step": 268
},
{
"epoch": 0.7598870056497176,
"grad_norm": 0.3385392307218001,
"learning_rate": 6.841447869730794e-06,
"loss": 0.1078,
"step": 269
},
{
"epoch": 0.7627118644067796,
"grad_norm": 0.3999058208022931,
"learning_rate": 6.820802876264112e-06,
"loss": 0.1002,
"step": 270
},
{
"epoch": 0.7655367231638418,
"grad_norm": 0.3086907716307657,
"learning_rate": 6.800122032278351e-06,
"loss": 0.1057,
"step": 271
},
{
"epoch": 0.768361581920904,
"grad_norm": 0.3019429125850021,
"learning_rate": 6.7794057449669545e-06,
"loss": 0.1224,
"step": 272
},
{
"epoch": 0.7711864406779662,
"grad_norm": 0.3649142358181941,
"learning_rate": 6.758654422221225e-06,
"loss": 0.1229,
"step": 273
},
{
"epoch": 0.7740112994350282,
"grad_norm": 0.3886884309856128,
"learning_rate": 6.7378684726222875e-06,
"loss": 0.1347,
"step": 274
},
{
"epoch": 0.7768361581920904,
"grad_norm": 0.39178721337354694,
"learning_rate": 6.717048305433053e-06,
"loss": 0.1395,
"step": 275
},
{
"epoch": 0.7796610169491526,
"grad_norm": 0.3560643954364327,
"learning_rate": 6.6961943305901515e-06,
"loss": 0.0996,
"step": 276
},
{
"epoch": 0.7824858757062146,
"grad_norm": 0.2987347215445713,
"learning_rate": 6.675306958695874e-06,
"loss": 0.0932,
"step": 277
},
{
"epoch": 0.7853107344632768,
"grad_norm": 0.391815080682088,
"learning_rate": 6.65438660101007e-06,
"loss": 0.1008,
"step": 278
},
{
"epoch": 0.788135593220339,
"grad_norm": 0.36385558646484645,
"learning_rate": 6.633433669442066e-06,
"loss": 0.1477,
"step": 279
},
{
"epoch": 0.7909604519774012,
"grad_norm": 0.3950859179251757,
"learning_rate": 6.612448576542545e-06,
"loss": 0.1546,
"step": 280
},
{
"epoch": 0.7937853107344632,
"grad_norm": 0.3508433770997797,
"learning_rate": 6.59143173549543e-06,
"loss": 0.1226,
"step": 281
},
{
"epoch": 0.7966101694915254,
"grad_norm": 0.31172563185935787,
"learning_rate": 6.570383560109745e-06,
"loss": 0.1159,
"step": 282
},
{
"epoch": 0.7994350282485876,
"grad_norm": 0.3722434789366503,
"learning_rate": 6.549304464811467e-06,
"loss": 0.1718,
"step": 283
},
{
"epoch": 0.8022598870056498,
"grad_norm": 0.5468457727438586,
"learning_rate": 6.52819486463537e-06,
"loss": 0.129,
"step": 284
},
{
"epoch": 0.8050847457627118,
"grad_norm": 0.37527355148100994,
"learning_rate": 6.50705517521685e-06,
"loss": 0.1245,
"step": 285
},
{
"epoch": 0.807909604519774,
"grad_norm": 0.3327457659403298,
"learning_rate": 6.48588581278374e-06,
"loss": 0.1058,
"step": 286
},
{
"epoch": 0.8107344632768362,
"grad_norm": 0.32768437622701796,
"learning_rate": 6.464687194148121e-06,
"loss": 0.1215,
"step": 287
},
{
"epoch": 0.8135593220338984,
"grad_norm": 0.2981605604482326,
"learning_rate": 6.443459736698106e-06,
"loss": 0.107,
"step": 288
},
{
"epoch": 0.8163841807909604,
"grad_norm": 0.39440686793408264,
"learning_rate": 6.422203858389633e-06,
"loss": 0.099,
"step": 289
},
{
"epoch": 0.8192090395480226,
"grad_norm": 0.3079389916506814,
"learning_rate": 6.400919977738222e-06,
"loss": 0.1261,
"step": 290
},
{
"epoch": 0.8220338983050848,
"grad_norm": 0.39230281512992937,
"learning_rate": 6.379608513810753e-06,
"loss": 0.1388,
"step": 291
},
{
"epoch": 0.8248587570621468,
"grad_norm": 0.34403030238315363,
"learning_rate": 6.3582698862171945e-06,
"loss": 0.1144,
"step": 292
},
{
"epoch": 0.827683615819209,
"grad_norm": 0.3554434392149389,
"learning_rate": 6.336904515102355e-06,
"loss": 0.1401,
"step": 293
},
{
"epoch": 0.8305084745762712,
"grad_norm": 0.3903419801304912,
"learning_rate": 6.315512821137606e-06,
"loss": 0.1166,
"step": 294
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.29926598642859675,
"learning_rate": 6.294095225512604e-06,
"loss": 0.1172,
"step": 295
},
{
"epoch": 0.8361581920903954,
"grad_norm": 0.3247406039107872,
"learning_rate": 6.272652149926989e-06,
"loss": 0.1206,
"step": 296
},
{
"epoch": 0.8389830508474576,
"grad_norm": 0.4155920338425029,
"learning_rate": 6.251184016582088e-06,
"loss": 0.1569,
"step": 297
},
{
"epoch": 0.8418079096045198,
"grad_norm": 0.49517100510425244,
"learning_rate": 6.229691248172599e-06,
"loss": 0.1269,
"step": 298
},
{
"epoch": 0.844632768361582,
"grad_norm": 0.3069243773291746,
"learning_rate": 6.208174267878272e-06,
"loss": 0.1039,
"step": 299
},
{
"epoch": 0.847457627118644,
"grad_norm": 0.30008373909078834,
"learning_rate": 6.186633499355576e-06,
"loss": 0.1011,
"step": 300
},
{
"epoch": 0.8502824858757062,
"grad_norm": 0.39498835620426576,
"learning_rate": 6.165069366729347e-06,
"loss": 0.1262,
"step": 301
},
{
"epoch": 0.8531073446327684,
"grad_norm": 0.45259238504149724,
"learning_rate": 6.143482294584459e-06,
"loss": 0.1555,
"step": 302
},
{
"epoch": 0.8559322033898306,
"grad_norm": 0.3111562216471757,
"learning_rate": 6.121872707957441e-06,
"loss": 0.1037,
"step": 303
},
{
"epoch": 0.8587570621468926,
"grad_norm": 0.38394089498308964,
"learning_rate": 6.100241032328125e-06,
"loss": 0.1381,
"step": 304
},
{
"epoch": 0.8615819209039548,
"grad_norm": 0.4084090521249512,
"learning_rate": 6.078587693611258e-06,
"loss": 0.132,
"step": 305
},
{
"epoch": 0.864406779661017,
"grad_norm": 0.32206751376923765,
"learning_rate": 6.056913118148122e-06,
"loss": 0.115,
"step": 306
},
{
"epoch": 0.867231638418079,
"grad_norm": 0.39859711839150824,
"learning_rate": 6.035217732698141e-06,
"loss": 0.0989,
"step": 307
},
{
"epoch": 0.8700564971751412,
"grad_norm": 0.3223901842186738,
"learning_rate": 6.013501964430468e-06,
"loss": 0.1129,
"step": 308
},
{
"epoch": 0.8728813559322034,
"grad_norm": 0.37501991260972894,
"learning_rate": 5.9917662409155896e-06,
"loss": 0.1158,
"step": 309
},
{
"epoch": 0.8757062146892656,
"grad_norm": 0.3680155603064568,
"learning_rate": 5.970010990116892e-06,
"loss": 0.135,
"step": 310
},
{
"epoch": 0.8785310734463276,
"grad_norm": 0.4225399124226897,
"learning_rate": 5.948236640382249e-06,
"loss": 0.1597,
"step": 311
},
{
"epoch": 0.8813559322033898,
"grad_norm": 0.3062593607691136,
"learning_rate": 5.926443620435572e-06,
"loss": 0.1136,
"step": 312
},
{
"epoch": 0.884180790960452,
"grad_norm": 0.39959497875600314,
"learning_rate": 5.904632359368388e-06,
"loss": 0.1177,
"step": 313
},
{
"epoch": 0.8870056497175142,
"grad_norm": 1.0041774505409626,
"learning_rate": 5.8828032866313725e-06,
"loss": 0.1129,
"step": 314
},
{
"epoch": 0.8898305084745762,
"grad_norm": 0.33893621066441904,
"learning_rate": 5.860956832025907e-06,
"loss": 0.1375,
"step": 315
},
{
"epoch": 0.8926553672316384,
"grad_norm": 0.39383272399940145,
"learning_rate": 5.839093425695609e-06,
"loss": 0.1422,
"step": 316
},
{
"epoch": 0.8954802259887006,
"grad_norm": 0.35738567030204327,
"learning_rate": 5.817213498117866e-06,
"loss": 0.1529,
"step": 317
},
{
"epoch": 0.8983050847457628,
"grad_norm": 0.39608069155077835,
"learning_rate": 5.795317480095361e-06,
"loss": 0.1716,
"step": 318
},
{
"epoch": 0.9011299435028248,
"grad_norm": 0.3631332118845927,
"learning_rate": 5.773405802747585e-06,
"loss": 0.1555,
"step": 319
},
{
"epoch": 0.903954802259887,
"grad_norm": 0.3841861480992692,
"learning_rate": 5.751478897502353e-06,
"loss": 0.1894,
"step": 320
},
{
"epoch": 0.9067796610169492,
"grad_norm": 0.29759516462695884,
"learning_rate": 5.729537196087309e-06,
"loss": 0.112,
"step": 321
},
{
"epoch": 0.9096045197740112,
"grad_norm": 0.28809510654440856,
"learning_rate": 5.707581130521424e-06,
"loss": 0.1134,
"step": 322
},
{
"epoch": 0.9124293785310734,
"grad_norm": 0.397848892460871,
"learning_rate": 5.685611133106491e-06,
"loss": 0.1297,
"step": 323
},
{
"epoch": 0.9152542372881356,
"grad_norm": 0.3160261105495359,
"learning_rate": 5.663627636418611e-06,
"loss": 0.1023,
"step": 324
},
{
"epoch": 0.9180790960451978,
"grad_norm": 0.31177416648537104,
"learning_rate": 5.64163107329968e-06,
"loss": 0.1143,
"step": 325
},
{
"epoch": 0.9209039548022598,
"grad_norm": 0.32018560545646063,
"learning_rate": 5.619621876848864e-06,
"loss": 0.0991,
"step": 326
},
{
"epoch": 0.923728813559322,
"grad_norm": 0.2963637468584666,
"learning_rate": 5.597600480414069e-06,
"loss": 0.1292,
"step": 327
},
{
"epoch": 0.9265536723163842,
"grad_norm": 0.44573857637587677,
"learning_rate": 5.575567317583415e-06,
"loss": 0.1217,
"step": 328
},
{
"epoch": 0.9293785310734464,
"grad_norm": 0.39445386690118284,
"learning_rate": 5.553522822176694e-06,
"loss": 0.1684,
"step": 329
},
{
"epoch": 0.9322033898305084,
"grad_norm": 0.5570629744547292,
"learning_rate": 5.531467428236827e-06,
"loss": 0.121,
"step": 330
},
{
"epoch": 0.9350282485875706,
"grad_norm": 0.2927441446558258,
"learning_rate": 5.5094015700213254e-06,
"loss": 0.1199,
"step": 331
},
{
"epoch": 0.9378531073446328,
"grad_norm": 0.31083045505241014,
"learning_rate": 5.4873256819937325e-06,
"loss": 0.1299,
"step": 332
},
{
"epoch": 0.940677966101695,
"grad_norm": 0.3685953460491956,
"learning_rate": 5.465240198815073e-06,
"loss": 0.1432,
"step": 333
},
{
"epoch": 0.943502824858757,
"grad_norm": 0.358140929608912,
"learning_rate": 5.443145555335296e-06,
"loss": 0.1148,
"step": 334
},
{
"epoch": 0.9463276836158192,
"grad_norm": 0.4047657464869157,
"learning_rate": 5.421042186584708e-06,
"loss": 0.1339,
"step": 335
},
{
"epoch": 0.9491525423728814,
"grad_norm": 0.30030457933241933,
"learning_rate": 5.398930527765416e-06,
"loss": 0.1301,
"step": 336
},
{
"epoch": 0.9519774011299436,
"grad_norm": 0.31945028324973423,
"learning_rate": 5.376811014242749e-06,
"loss": 0.1147,
"step": 337
},
{
"epoch": 0.9548022598870056,
"grad_norm": 0.48059842793389146,
"learning_rate": 5.354684081536693e-06,
"loss": 0.1251,
"step": 338
},
{
"epoch": 0.9576271186440678,
"grad_norm": 0.3547595891598906,
"learning_rate": 5.332550165313312e-06,
"loss": 0.1256,
"step": 339
},
{
"epoch": 0.96045197740113,
"grad_norm": 0.4010586374877903,
"learning_rate": 5.31040970137617e-06,
"loss": 0.1521,
"step": 340
},
{
"epoch": 0.963276836158192,
"grad_norm": 0.30749195608479307,
"learning_rate": 5.288263125657757e-06,
"loss": 0.0898,
"step": 341
},
{
"epoch": 0.9661016949152542,
"grad_norm": 0.38150734834003086,
"learning_rate": 5.266110874210893e-06,
"loss": 0.1153,
"step": 342
},
{
"epoch": 0.9689265536723164,
"grad_norm": 0.4214873565398611,
"learning_rate": 5.2439533832001565e-06,
"loss": 0.1148,
"step": 343
},
{
"epoch": 0.9717514124293786,
"grad_norm": 0.6122493487962786,
"learning_rate": 5.221791088893282e-06,
"loss": 0.1104,
"step": 344
},
{
"epoch": 0.9745762711864406,
"grad_norm": 0.3363807115202252,
"learning_rate": 5.199624427652589e-06,
"loss": 0.1223,
"step": 345
},
{
"epoch": 0.9774011299435028,
"grad_norm": 0.4560391650148292,
"learning_rate": 5.177453835926366e-06,
"loss": 0.1279,
"step": 346
},
{
"epoch": 0.980225988700565,
"grad_norm": 0.34518676510036406,
"learning_rate": 5.155279750240302e-06,
"loss": 0.111,
"step": 347
},
{
"epoch": 0.9830508474576272,
"grad_norm": 0.37562743319165254,
"learning_rate": 5.133102607188875e-06,
"loss": 0.1223,
"step": 348
},
{
"epoch": 0.9858757062146892,
"grad_norm": 0.37060789770525393,
"learning_rate": 5.1109228434267585e-06,
"loss": 0.1205,
"step": 349
},
{
"epoch": 0.9887005649717514,
"grad_norm": 0.30512645778837,
"learning_rate": 5.0887408956602316e-06,
"loss": 0.1123,
"step": 350
},
{
"epoch": 0.9915254237288136,
"grad_norm": 0.31734580505159626,
"learning_rate": 5.06655720063857e-06,
"loss": 0.1393,
"step": 351
},
{
"epoch": 0.9943502824858758,
"grad_norm": 0.4252275291418614,
"learning_rate": 5.044372195145455e-06,
"loss": 0.1804,
"step": 352
},
{
"epoch": 0.9971751412429378,
"grad_norm": 0.445405112876313,
"learning_rate": 5.022186315990371e-06,
"loss": 0.1466,
"step": 353
},
{
"epoch": 1.0,
"grad_norm": 0.33905894838175626,
"learning_rate": 5e-06,
"loss": 0.123,
"step": 354
},
{
"epoch": 1.002824858757062,
"grad_norm": 0.2724848502115793,
"learning_rate": 4.97781368400963e-06,
"loss": 0.0967,
"step": 355
},
{
"epoch": 1.0056497175141244,
"grad_norm": 0.25595208945562964,
"learning_rate": 4.9556278048545445e-06,
"loss": 0.0704,
"step": 356
},
{
"epoch": 1.0084745762711864,
"grad_norm": 0.2425492096389186,
"learning_rate": 4.933442799361432e-06,
"loss": 0.0799,
"step": 357
},
{
"epoch": 1.0112994350282485,
"grad_norm": 0.2761823631954267,
"learning_rate": 4.911259104339771e-06,
"loss": 0.0936,
"step": 358
},
{
"epoch": 1.0141242937853108,
"grad_norm": 0.27454279772458723,
"learning_rate": 4.889077156573242e-06,
"loss": 0.1175,
"step": 359
},
{
"epoch": 1.0169491525423728,
"grad_norm": 0.31992512429885583,
"learning_rate": 4.866897392811127e-06,
"loss": 0.0968,
"step": 360
},
{
"epoch": 1.0197740112994351,
"grad_norm": 0.2894306047919742,
"learning_rate": 4.8447202497596975e-06,
"loss": 0.1236,
"step": 361
},
{
"epoch": 1.0225988700564972,
"grad_norm": 0.24920669865954365,
"learning_rate": 4.822546164073635e-06,
"loss": 0.0852,
"step": 362
},
{
"epoch": 1.0254237288135593,
"grad_norm": 0.2952175126202192,
"learning_rate": 4.800375572347414e-06,
"loss": 0.0991,
"step": 363
},
{
"epoch": 1.0282485875706215,
"grad_norm": 0.23148883860994288,
"learning_rate": 4.778208911106718e-06,
"loss": 0.066,
"step": 364
},
{
"epoch": 1.0310734463276836,
"grad_norm": 0.2692480902893908,
"learning_rate": 4.756046616799845e-06,
"loss": 0.0973,
"step": 365
},
{
"epoch": 1.0338983050847457,
"grad_norm": 0.2760736585863661,
"learning_rate": 4.7338891257891085e-06,
"loss": 0.0912,
"step": 366
},
{
"epoch": 1.036723163841808,
"grad_norm": 0.24876270637682962,
"learning_rate": 4.7117368743422435e-06,
"loss": 0.0837,
"step": 367
},
{
"epoch": 1.03954802259887,
"grad_norm": 0.2494244392514283,
"learning_rate": 4.689590298623831e-06,
"loss": 0.0811,
"step": 368
},
{
"epoch": 1.042372881355932,
"grad_norm": 0.24104486268426012,
"learning_rate": 4.667449834686689e-06,
"loss": 0.076,
"step": 369
},
{
"epoch": 1.0451977401129944,
"grad_norm": 0.2789423674939665,
"learning_rate": 4.645315918463308e-06,
"loss": 0.086,
"step": 370
},
{
"epoch": 1.0480225988700564,
"grad_norm": 0.2770971973884279,
"learning_rate": 4.623188985757252e-06,
"loss": 0.089,
"step": 371
},
{
"epoch": 1.0508474576271187,
"grad_norm": 0.28375117683833695,
"learning_rate": 4.601069472234584e-06,
"loss": 0.1046,
"step": 372
},
{
"epoch": 1.0536723163841808,
"grad_norm": 0.23874038173398437,
"learning_rate": 4.578957813415293e-06,
"loss": 0.0657,
"step": 373
},
{
"epoch": 1.0564971751412429,
"grad_norm": 0.27927992062438906,
"learning_rate": 4.556854444664706e-06,
"loss": 0.0823,
"step": 374
},
{
"epoch": 1.0593220338983051,
"grad_norm": 0.25006740348604295,
"learning_rate": 4.534759801184928e-06,
"loss": 0.084,
"step": 375
},
{
"epoch": 1.0621468926553672,
"grad_norm": 0.26172192084359713,
"learning_rate": 4.512674318006268e-06,
"loss": 0.0688,
"step": 376
},
{
"epoch": 1.0649717514124293,
"grad_norm": 0.27984219569057084,
"learning_rate": 4.490598429978676e-06,
"loss": 0.1003,
"step": 377
},
{
"epoch": 1.0677966101694916,
"grad_norm": 0.31355314424830794,
"learning_rate": 4.468532571763174e-06,
"loss": 0.093,
"step": 378
},
{
"epoch": 1.0706214689265536,
"grad_norm": 0.2789313738912747,
"learning_rate": 4.446477177823308e-06,
"loss": 0.0891,
"step": 379
},
{
"epoch": 1.073446327683616,
"grad_norm": 0.2647947856217315,
"learning_rate": 4.424432682416585e-06,
"loss": 0.0657,
"step": 380
},
{
"epoch": 1.076271186440678,
"grad_norm": 0.309205913837631,
"learning_rate": 4.402399519585932e-06,
"loss": 0.091,
"step": 381
},
{
"epoch": 1.07909604519774,
"grad_norm": 0.27983746723387665,
"learning_rate": 4.380378123151139e-06,
"loss": 0.0758,
"step": 382
},
{
"epoch": 1.0819209039548023,
"grad_norm": 0.265317674784406,
"learning_rate": 4.358368926700321e-06,
"loss": 0.0744,
"step": 383
},
{
"epoch": 1.0847457627118644,
"grad_norm": 0.3844212564195145,
"learning_rate": 4.336372363581391e-06,
"loss": 0.1193,
"step": 384
},
{
"epoch": 1.0875706214689265,
"grad_norm": 0.301325518174874,
"learning_rate": 4.314388866893512e-06,
"loss": 0.0954,
"step": 385
},
{
"epoch": 1.0903954802259888,
"grad_norm": 0.2778786659311699,
"learning_rate": 4.292418869478577e-06,
"loss": 0.0791,
"step": 386
},
{
"epoch": 1.0932203389830508,
"grad_norm": 0.2662950349405817,
"learning_rate": 4.270462803912692e-06,
"loss": 0.076,
"step": 387
},
{
"epoch": 1.0960451977401129,
"grad_norm": 0.2796794355932831,
"learning_rate": 4.248521102497649e-06,
"loss": 0.0804,
"step": 388
},
{
"epoch": 1.0988700564971752,
"grad_norm": 0.2768998598394203,
"learning_rate": 4.226594197252417e-06,
"loss": 0.0834,
"step": 389
},
{
"epoch": 1.1016949152542372,
"grad_norm": 0.27374836470708835,
"learning_rate": 4.204682519904641e-06,
"loss": 0.0718,
"step": 390
},
{
"epoch": 1.1045197740112995,
"grad_norm": 0.3445589105129125,
"learning_rate": 4.182786501882135e-06,
"loss": 0.1162,
"step": 391
},
{
"epoch": 1.1073446327683616,
"grad_norm": 0.34154494685980724,
"learning_rate": 4.160906574304392e-06,
"loss": 0.0821,
"step": 392
},
{
"epoch": 1.1101694915254237,
"grad_norm": 0.28827252450009566,
"learning_rate": 4.139043167974096e-06,
"loss": 0.0789,
"step": 393
},
{
"epoch": 1.112994350282486,
"grad_norm": 0.29707210161706604,
"learning_rate": 4.117196713368629e-06,
"loss": 0.0826,
"step": 394
},
{
"epoch": 1.115819209039548,
"grad_norm": 0.27591994189237984,
"learning_rate": 4.095367640631614e-06,
"loss": 0.0703,
"step": 395
},
{
"epoch": 1.11864406779661,
"grad_norm": 0.31721715937650236,
"learning_rate": 4.073556379564429e-06,
"loss": 0.0741,
"step": 396
},
{
"epoch": 1.1214689265536724,
"grad_norm": 0.2809918490407584,
"learning_rate": 4.051763359617753e-06,
"loss": 0.0768,
"step": 397
},
{
"epoch": 1.1242937853107344,
"grad_norm": 0.3012659075431546,
"learning_rate": 4.0299890098831096e-06,
"loss": 0.0899,
"step": 398
},
{
"epoch": 1.1271186440677967,
"grad_norm": 0.41884111681760783,
"learning_rate": 4.00823375908441e-06,
"loss": 0.1056,
"step": 399
},
{
"epoch": 1.1299435028248588,
"grad_norm": 0.3072172569806948,
"learning_rate": 3.986498035569533e-06,
"loss": 0.0946,
"step": 400
},
{
"epoch": 1.1327683615819208,
"grad_norm": 0.30733421162176133,
"learning_rate": 3.964782267301861e-06,
"loss": 0.1148,
"step": 401
},
{
"epoch": 1.1355932203389831,
"grad_norm": 0.3062196925146934,
"learning_rate": 3.9430868818518786e-06,
"loss": 0.0939,
"step": 402
},
{
"epoch": 1.1384180790960452,
"grad_norm": 0.31120677637400174,
"learning_rate": 3.921412306388744e-06,
"loss": 0.0907,
"step": 403
},
{
"epoch": 1.1412429378531073,
"grad_norm": 0.29992317179141076,
"learning_rate": 3.899758967671879e-06,
"loss": 0.0936,
"step": 404
},
{
"epoch": 1.1440677966101696,
"grad_norm": 0.3192209928402148,
"learning_rate": 3.8781272920425605e-06,
"loss": 0.0926,
"step": 405
},
{
"epoch": 1.1468926553672316,
"grad_norm": 0.2758355027322586,
"learning_rate": 3.856517705415543e-06,
"loss": 0.0716,
"step": 406
},
{
"epoch": 1.1497175141242937,
"grad_norm": 0.30041138229139885,
"learning_rate": 3.834930633270654e-06,
"loss": 0.0915,
"step": 407
},
{
"epoch": 1.152542372881356,
"grad_norm": 0.2793345364877422,
"learning_rate": 3.813366500644426e-06,
"loss": 0.084,
"step": 408
},
{
"epoch": 1.155367231638418,
"grad_norm": 0.31892388449193476,
"learning_rate": 3.791825732121729e-06,
"loss": 0.0874,
"step": 409
},
{
"epoch": 1.1581920903954803,
"grad_norm": 0.31584149456312116,
"learning_rate": 3.770308751827402e-06,
"loss": 0.0973,
"step": 410
},
{
"epoch": 1.1610169491525424,
"grad_norm": 0.34820980091040554,
"learning_rate": 3.748815983417914e-06,
"loss": 0.1253,
"step": 411
},
{
"epoch": 1.1638418079096045,
"grad_norm": 0.28853430489950965,
"learning_rate": 3.727347850073012e-06,
"loss": 0.0759,
"step": 412
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.3062765715619059,
"learning_rate": 3.705904774487396e-06,
"loss": 0.0933,
"step": 413
},
{
"epoch": 1.1694915254237288,
"grad_norm": 0.32486365661487987,
"learning_rate": 3.6844871788623946e-06,
"loss": 0.0911,
"step": 414
},
{
"epoch": 1.1723163841807909,
"grad_norm": 0.3026538149404601,
"learning_rate": 3.6630954848976472e-06,
"loss": 0.0942,
"step": 415
},
{
"epoch": 1.1751412429378532,
"grad_norm": 0.3953970274281564,
"learning_rate": 3.641730113782807e-06,
"loss": 0.0779,
"step": 416
},
{
"epoch": 1.1779661016949152,
"grad_norm": 0.2978638146695922,
"learning_rate": 3.6203914861892483e-06,
"loss": 0.0907,
"step": 417
},
{
"epoch": 1.1807909604519775,
"grad_norm": 0.29056550406150716,
"learning_rate": 3.5990800222617774e-06,
"loss": 0.0754,
"step": 418
},
{
"epoch": 1.1836158192090396,
"grad_norm": 0.2519599426657091,
"learning_rate": 3.577796141610369e-06,
"loss": 0.0632,
"step": 419
},
{
"epoch": 1.1864406779661016,
"grad_norm": 0.2867955638349113,
"learning_rate": 3.5565402633018963e-06,
"loss": 0.0854,
"step": 420
},
{
"epoch": 1.189265536723164,
"grad_norm": 0.27040139234217375,
"learning_rate": 3.535312805851881e-06,
"loss": 0.0676,
"step": 421
},
{
"epoch": 1.192090395480226,
"grad_norm": 0.3332989661333647,
"learning_rate": 3.5141141872162613e-06,
"loss": 0.1127,
"step": 422
},
{
"epoch": 1.194915254237288,
"grad_norm": 0.309745904183909,
"learning_rate": 3.4929448247831523e-06,
"loss": 0.0917,
"step": 423
},
{
"epoch": 1.1977401129943503,
"grad_norm": 0.45096384475777856,
"learning_rate": 3.4718051353646304e-06,
"loss": 0.1173,
"step": 424
},
{
"epoch": 1.2005649717514124,
"grad_norm": 0.30878271329326906,
"learning_rate": 3.4506955351885346e-06,
"loss": 0.0919,
"step": 425
},
{
"epoch": 1.2033898305084745,
"grad_norm": 0.320682805049876,
"learning_rate": 3.4296164398902576e-06,
"loss": 0.0922,
"step": 426
},
{
"epoch": 1.2062146892655368,
"grad_norm": 0.30581016394054344,
"learning_rate": 3.408568264504571e-06,
"loss": 0.0809,
"step": 427
},
{
"epoch": 1.2090395480225988,
"grad_norm": 0.2747339191019564,
"learning_rate": 3.387551423457456e-06,
"loss": 0.0802,
"step": 428
},
{
"epoch": 1.211864406779661,
"grad_norm": 0.552467904262321,
"learning_rate": 3.366566330557935e-06,
"loss": 0.1036,
"step": 429
},
{
"epoch": 1.2146892655367232,
"grad_norm": 0.32041597951648537,
"learning_rate": 3.345613398989932e-06,
"loss": 0.0849,
"step": 430
},
{
"epoch": 1.2175141242937852,
"grad_norm": 0.32164554989021144,
"learning_rate": 3.324693041304128e-06,
"loss": 0.0901,
"step": 431
},
{
"epoch": 1.2203389830508475,
"grad_norm": 0.297941242679515,
"learning_rate": 3.3038056694098485e-06,
"loss": 0.0857,
"step": 432
},
{
"epoch": 1.2231638418079096,
"grad_norm": 0.29988786294219155,
"learning_rate": 3.2829516945669493e-06,
"loss": 0.0658,
"step": 433
},
{
"epoch": 1.2259887005649717,
"grad_norm": 0.28671491672159266,
"learning_rate": 3.262131527377715e-06,
"loss": 0.0825,
"step": 434
},
{
"epoch": 1.228813559322034,
"grad_norm": 0.29202073423769753,
"learning_rate": 3.241345577778775e-06,
"loss": 0.0793,
"step": 435
},
{
"epoch": 1.231638418079096,
"grad_norm": 0.26686163546586056,
"learning_rate": 3.220594255033046e-06,
"loss": 0.0621,
"step": 436
},
{
"epoch": 1.2344632768361583,
"grad_norm": 0.2713137149878859,
"learning_rate": 3.1998779677216508e-06,
"loss": 0.0731,
"step": 437
},
{
"epoch": 1.2372881355932204,
"grad_norm": 0.2775500944738776,
"learning_rate": 3.1791971237358893e-06,
"loss": 0.0734,
"step": 438
},
{
"epoch": 1.2401129943502824,
"grad_norm": 0.28779045523216457,
"learning_rate": 3.1585521302692073e-06,
"loss": 0.0924,
"step": 439
},
{
"epoch": 1.2429378531073447,
"grad_norm": 0.3012961352670417,
"learning_rate": 3.1379433938091695e-06,
"loss": 0.0977,
"step": 440
},
{
"epoch": 1.2457627118644068,
"grad_norm": 0.2790213382568145,
"learning_rate": 3.117371320129469e-06,
"loss": 0.0638,
"step": 441
},
{
"epoch": 1.2485875706214689,
"grad_norm": 0.289333568849024,
"learning_rate": 3.0968363142819226e-06,
"loss": 0.0835,
"step": 442
},
{
"epoch": 1.2514124293785311,
"grad_norm": 0.27842001426208673,
"learning_rate": 3.076338780588507e-06,
"loss": 0.0744,
"step": 443
},
{
"epoch": 1.2542372881355932,
"grad_norm": 0.29218223586498643,
"learning_rate": 3.0558791226333974e-06,
"loss": 0.084,
"step": 444
},
{
"epoch": 1.2570621468926553,
"grad_norm": 0.27223825874055874,
"learning_rate": 3.035457743255016e-06,
"loss": 0.0836,
"step": 445
},
{
"epoch": 1.2598870056497176,
"grad_norm": 0.27957551489646953,
"learning_rate": 3.0150750445380995e-06,
"loss": 0.0782,
"step": 446
},
{
"epoch": 1.2627118644067796,
"grad_norm": 0.3122940814886727,
"learning_rate": 2.9947314278057927e-06,
"loss": 0.1053,
"step": 447
},
{
"epoch": 1.2655367231638417,
"grad_norm": 0.3191436186366097,
"learning_rate": 2.9744272936117323e-06,
"loss": 0.1014,
"step": 448
},
{
"epoch": 1.268361581920904,
"grad_norm": 0.2842992416668331,
"learning_rate": 2.954163041732174e-06,
"loss": 0.0749,
"step": 449
},
{
"epoch": 1.271186440677966,
"grad_norm": 0.30400106395182974,
"learning_rate": 2.9339390711581105e-06,
"loss": 0.0887,
"step": 450
},
{
"epoch": 1.274011299435028,
"grad_norm": 0.29506183884480336,
"learning_rate": 2.9137557800874177e-06,
"loss": 0.091,
"step": 451
},
{
"epoch": 1.2768361581920904,
"grad_norm": 0.3144746461494332,
"learning_rate": 2.8936135659170217e-06,
"loss": 0.1059,
"step": 452
},
{
"epoch": 1.2796610169491525,
"grad_norm": 0.2996152337603787,
"learning_rate": 2.8735128252350677e-06,
"loss": 0.0786,
"step": 453
},
{
"epoch": 1.2824858757062148,
"grad_norm": 0.34649915198509984,
"learning_rate": 2.853453953813108e-06,
"loss": 0.0802,
"step": 454
},
{
"epoch": 1.2853107344632768,
"grad_norm": 0.3012239481267599,
"learning_rate": 2.8334373465983216e-06,
"loss": 0.0895,
"step": 455
},
{
"epoch": 1.288135593220339,
"grad_norm": 0.28862860268152546,
"learning_rate": 2.8134633977057236e-06,
"loss": 0.0839,
"step": 456
},
{
"epoch": 1.2909604519774012,
"grad_norm": 0.3084982778477155,
"learning_rate": 2.7935325004104164e-06,
"loss": 0.1009,
"step": 457
},
{
"epoch": 1.2937853107344632,
"grad_norm": 0.3282395905268862,
"learning_rate": 2.7736450471398435e-06,
"loss": 0.0652,
"step": 458
},
{
"epoch": 1.2966101694915255,
"grad_norm": 0.2793458687882274,
"learning_rate": 2.7538014294660564e-06,
"loss": 0.06,
"step": 459
},
{
"epoch": 1.2994350282485876,
"grad_norm": 0.29032428847483527,
"learning_rate": 2.734002038098015e-06,
"loss": 0.0674,
"step": 460
},
{
"epoch": 1.3022598870056497,
"grad_norm": 0.26031067828121474,
"learning_rate": 2.7142472628738846e-06,
"loss": 0.0628,
"step": 461
},
{
"epoch": 1.305084745762712,
"grad_norm": 0.3479546918961412,
"learning_rate": 2.69453749275337e-06,
"loss": 0.105,
"step": 462
},
{
"epoch": 1.307909604519774,
"grad_norm": 0.2777871642249738,
"learning_rate": 2.6748731158100528e-06,
"loss": 0.0733,
"step": 463
},
{
"epoch": 1.310734463276836,
"grad_norm": 0.29860877000280583,
"learning_rate": 2.655254519223746e-06,
"loss": 0.0956,
"step": 464
},
{
"epoch": 1.3135593220338984,
"grad_norm": 0.39509978738969753,
"learning_rate": 2.6356820892728752e-06,
"loss": 0.098,
"step": 465
},
{
"epoch": 1.3163841807909604,
"grad_norm": 0.26842617785571943,
"learning_rate": 2.616156211326875e-06,
"loss": 0.0683,
"step": 466
},
{
"epoch": 1.3192090395480225,
"grad_norm": 0.2638729229718093,
"learning_rate": 2.5966772698386e-06,
"loss": 0.0697,
"step": 467
},
{
"epoch": 1.3220338983050848,
"grad_norm": 0.2885132528975007,
"learning_rate": 2.57724564833675e-06,
"loss": 0.0853,
"step": 468
},
{
"epoch": 1.3248587570621468,
"grad_norm": 0.2909952803458812,
"learning_rate": 2.557861729418326e-06,
"loss": 0.0702,
"step": 469
},
{
"epoch": 1.327683615819209,
"grad_norm": 0.3508969831429909,
"learning_rate": 2.5385258947410908e-06,
"loss": 0.1163,
"step": 470
},
{
"epoch": 1.3305084745762712,
"grad_norm": 0.3137041512371342,
"learning_rate": 2.5192385250160587e-06,
"loss": 0.0791,
"step": 471
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.34011082522993374,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.0863,
"step": 472
},
{
"epoch": 1.3361581920903955,
"grad_norm": 0.31378850435891975,
"learning_rate": 2.4808106984879597e-06,
"loss": 0.1031,
"step": 473
},
{
"epoch": 1.3389830508474576,
"grad_norm": 0.32916522472842985,
"learning_rate": 2.461670998305802e-06,
"loss": 0.1104,
"step": 474
},
{
"epoch": 1.34180790960452,
"grad_norm": 0.2858816116663427,
"learning_rate": 2.4425812763027672e-06,
"loss": 0.082,
"step": 475
},
{
"epoch": 1.344632768361582,
"grad_norm": 0.3311097135552006,
"learning_rate": 2.4235419083440615e-06,
"loss": 0.1001,
"step": 476
},
{
"epoch": 1.347457627118644,
"grad_norm": 0.3212815875251671,
"learning_rate": 2.404553269303448e-06,
"loss": 0.0706,
"step": 477
},
{
"epoch": 1.3502824858757063,
"grad_norm": 0.3059752435224393,
"learning_rate": 2.3856157330558625e-06,
"loss": 0.0858,
"step": 478
},
{
"epoch": 1.3531073446327684,
"grad_norm": 0.3389049114062409,
"learning_rate": 2.366729672470065e-06,
"loss": 0.0853,
"step": 479
},
{
"epoch": 1.3559322033898304,
"grad_norm": 0.32200853396437623,
"learning_rate": 2.3478954594012884e-06,
"loss": 0.11,
"step": 480
},
{
"epoch": 1.3587570621468927,
"grad_norm": 0.30985593467605343,
"learning_rate": 2.329113464683913e-06,
"loss": 0.0925,
"step": 481
},
{
"epoch": 1.3615819209039548,
"grad_norm": 0.29010419081013045,
"learning_rate": 2.310384058124181e-06,
"loss": 0.079,
"step": 482
},
{
"epoch": 1.3644067796610169,
"grad_norm": 0.2868431818032766,
"learning_rate": 2.2917076084928953e-06,
"loss": 0.0691,
"step": 483
},
{
"epoch": 1.3672316384180792,
"grad_norm": 0.32921780136851597,
"learning_rate": 2.273084483518176e-06,
"loss": 0.1029,
"step": 484
},
{
"epoch": 1.3700564971751412,
"grad_norm": 0.3443609157960376,
"learning_rate": 2.25451504987821e-06,
"loss": 0.1094,
"step": 485
},
{
"epoch": 1.3728813559322033,
"grad_norm": 0.3442323075700346,
"learning_rate": 2.2359996731940348e-06,
"loss": 0.1148,
"step": 486
},
{
"epoch": 1.3757062146892656,
"grad_norm": 0.31019876061048274,
"learning_rate": 2.2175387180223333e-06,
"loss": 0.0846,
"step": 487
},
{
"epoch": 1.3785310734463276,
"grad_norm": 0.27898269961579986,
"learning_rate": 2.1991325478482695e-06,
"loss": 0.0858,
"step": 488
},
{
"epoch": 1.3813559322033897,
"grad_norm": 0.30200643340593814,
"learning_rate": 2.1807815250783194e-06,
"loss": 0.0901,
"step": 489
},
{
"epoch": 1.384180790960452,
"grad_norm": 0.28412971697416345,
"learning_rate": 2.162486011033142e-06,
"loss": 0.0649,
"step": 490
},
{
"epoch": 1.387005649717514,
"grad_norm": 0.28849690690010993,
"learning_rate": 2.1442463659404587e-06,
"loss": 0.0734,
"step": 491
},
{
"epoch": 1.3898305084745763,
"grad_norm": 0.2872214027286925,
"learning_rate": 2.1260629489279662e-06,
"loss": 0.0744,
"step": 492
},
{
"epoch": 1.3926553672316384,
"grad_norm": 0.2856113105572892,
"learning_rate": 2.1079361180162657e-06,
"loss": 0.0772,
"step": 493
},
{
"epoch": 1.3954802259887007,
"grad_norm": 0.2917591710852941,
"learning_rate": 2.089866230111813e-06,
"loss": 0.0872,
"step": 494
},
{
"epoch": 1.3983050847457628,
"grad_norm": 0.3156299163799424,
"learning_rate": 2.0718536409998834e-06,
"loss": 0.0755,
"step": 495
},
{
"epoch": 1.4011299435028248,
"grad_norm": 0.3420052483929326,
"learning_rate": 2.053898705337583e-06,
"loss": 0.0833,
"step": 496
},
{
"epoch": 1.4039548022598871,
"grad_norm": 0.3007655048264405,
"learning_rate": 2.0360017766468466e-06,
"loss": 0.0755,
"step": 497
},
{
"epoch": 1.4067796610169492,
"grad_norm": 0.3285257384928867,
"learning_rate": 2.0181632073074925e-06,
"loss": 0.0882,
"step": 498
},
{
"epoch": 1.4096045197740112,
"grad_norm": 0.27048721301742223,
"learning_rate": 2.000383348550279e-06,
"loss": 0.0739,
"step": 499
},
{
"epoch": 1.4124293785310735,
"grad_norm": 0.2984410410038522,
"learning_rate": 1.9826625504499807e-06,
"loss": 0.0954,
"step": 500
},
{
"epoch": 1.4124293785310735,
"eval_loss": 0.130197674036026,
"eval_runtime": 1.5872,
"eval_samples_per_second": 18.271,
"eval_steps_per_second": 5.04,
"step": 500
},
{
"epoch": 1.4152542372881356,
"grad_norm": 0.2898926398978837,
"learning_rate": 1.965001161918513e-06,
"loss": 0.0789,
"step": 501
},
{
"epoch": 1.4180790960451977,
"grad_norm": 0.3465927210704245,
"learning_rate": 1.947399530698043e-06,
"loss": 0.0979,
"step": 502
},
{
"epoch": 1.42090395480226,
"grad_norm": 0.3475682598971018,
"learning_rate": 1.92985800335416e-06,
"loss": 0.0843,
"step": 503
},
{
"epoch": 1.423728813559322,
"grad_norm": 0.3294028190875718,
"learning_rate": 1.912376925269041e-06,
"loss": 0.1121,
"step": 504
},
{
"epoch": 1.426553672316384,
"grad_norm": 0.3498640725406821,
"learning_rate": 1.894956640634652e-06,
"loss": 0.0828,
"step": 505
},
{
"epoch": 1.4293785310734464,
"grad_norm": 0.30225327794900386,
"learning_rate": 1.8775974924459716e-06,
"loss": 0.085,
"step": 506
},
{
"epoch": 1.4322033898305084,
"grad_norm": 0.3931329736136659,
"learning_rate": 1.860299822494241e-06,
"loss": 0.0724,
"step": 507
},
{
"epoch": 1.4350282485875705,
"grad_norm": 0.28157973731776237,
"learning_rate": 1.8430639713602317e-06,
"loss": 0.0658,
"step": 508
},
{
"epoch": 1.4378531073446328,
"grad_norm": 0.28515233111654203,
"learning_rate": 1.8258902784075394e-06,
"loss": 0.0847,
"step": 509
},
{
"epoch": 1.4406779661016949,
"grad_norm": 0.2760261233151599,
"learning_rate": 1.808779081775901e-06,
"loss": 0.066,
"step": 510
},
{
"epoch": 1.4435028248587571,
"grad_norm": 0.2941924396596464,
"learning_rate": 1.7917307183745353e-06,
"loss": 0.0884,
"step": 511
},
{
"epoch": 1.4463276836158192,
"grad_norm": 0.2686743706277037,
"learning_rate": 1.7747455238755223e-06,
"loss": 0.0743,
"step": 512
},
{
"epoch": 1.4491525423728815,
"grad_norm": 0.3151180489919417,
"learning_rate": 1.757823832707175e-06,
"loss": 0.1007,
"step": 513
},
{
"epoch": 1.4519774011299436,
"grad_norm": 0.34222558948663734,
"learning_rate": 1.7409659780474652e-06,
"loss": 0.103,
"step": 514
},
{
"epoch": 1.4548022598870056,
"grad_norm": 0.3402549223734959,
"learning_rate": 1.7241722918174642e-06,
"loss": 0.1213,
"step": 515
},
{
"epoch": 1.457627118644068,
"grad_norm": 0.3145312403253552,
"learning_rate": 1.7074431046748075e-06,
"loss": 0.0969,
"step": 516
},
{
"epoch": 1.46045197740113,
"grad_norm": 0.2896418247469403,
"learning_rate": 1.6907787460071756e-06,
"loss": 0.0862,
"step": 517
},
{
"epoch": 1.463276836158192,
"grad_norm": 0.31049200696233425,
"learning_rate": 1.6741795439258218e-06,
"loss": 0.098,
"step": 518
},
{
"epoch": 1.4661016949152543,
"grad_norm": 0.36636752026678193,
"learning_rate": 1.6576458252590988e-06,
"loss": 0.1391,
"step": 519
},
{
"epoch": 1.4689265536723164,
"grad_norm": 0.27909992931423844,
"learning_rate": 1.641177915546036e-06,
"loss": 0.0744,
"step": 520
},
{
"epoch": 1.4717514124293785,
"grad_norm": 0.2864859483455,
"learning_rate": 1.6247761390299221e-06,
"loss": 0.0898,
"step": 521
},
{
"epoch": 1.4745762711864407,
"grad_norm": 0.4362808575365348,
"learning_rate": 1.6084408186519195e-06,
"loss": 0.0734,
"step": 522
},
{
"epoch": 1.4774011299435028,
"grad_norm": 0.28051391012639115,
"learning_rate": 1.5921722760447144e-06,
"loss": 0.0678,
"step": 523
},
{
"epoch": 1.4802259887005649,
"grad_norm": 0.30037283427959355,
"learning_rate": 1.5759708315261724e-06,
"loss": 0.0932,
"step": 524
},
{
"epoch": 1.4830508474576272,
"grad_norm": 0.2982894826882516,
"learning_rate": 1.5598368040930427e-06,
"loss": 0.0735,
"step": 525
},
{
"epoch": 1.4858757062146892,
"grad_norm": 0.3166739516240939,
"learning_rate": 1.5437705114146735e-06,
"loss": 0.1003,
"step": 526
},
{
"epoch": 1.4887005649717513,
"grad_norm": 0.3288809776102775,
"learning_rate": 1.527772269826749e-06,
"loss": 0.0984,
"step": 527
},
{
"epoch": 1.4915254237288136,
"grad_norm": 0.29168718949906514,
"learning_rate": 1.511842394325077e-06,
"loss": 0.0907,
"step": 528
},
{
"epoch": 1.4943502824858756,
"grad_norm": 0.2993454545746122,
"learning_rate": 1.4959811985593707e-06,
"loss": 0.0648,
"step": 529
},
{
"epoch": 1.497175141242938,
"grad_norm": 0.2901988801448637,
"learning_rate": 1.4801889948270852e-06,
"loss": 0.0843,
"step": 530
},
{
"epoch": 1.5,
"grad_norm": 0.276078183792322,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.0646,
"step": 531
},
{
"epoch": 1.5028248587570623,
"grad_norm": 0.2957096527763229,
"learning_rate": 1.44881280585441e-06,
"loss": 0.0838,
"step": 532
},
{
"epoch": 1.5056497175141241,
"grad_norm": 0.3074950453422565,
"learning_rate": 1.4332294383924034e-06,
"loss": 0.0976,
"step": 533
},
{
"epoch": 1.5084745762711864,
"grad_norm": 0.28111249969205165,
"learning_rate": 1.4177162985084242e-06,
"loss": 0.07,
"step": 534
},
{
"epoch": 1.5112994350282487,
"grad_norm": 0.3040746365690992,
"learning_rate": 1.4022736916469166e-06,
"loss": 0.0675,
"step": 535
},
{
"epoch": 1.5141242937853108,
"grad_norm": 0.2970452295573905,
"learning_rate": 1.3869019218635644e-06,
"loss": 0.0937,
"step": 536
},
{
"epoch": 1.5169491525423728,
"grad_norm": 0.3091154520174239,
"learning_rate": 1.3716012918193206e-06,
"loss": 0.0761,
"step": 537
},
{
"epoch": 1.5197740112994351,
"grad_norm": 0.33856268143824403,
"learning_rate": 1.3563721027744309e-06,
"loss": 0.0941,
"step": 538
},
{
"epoch": 1.5225988700564972,
"grad_norm": 0.2960053858988549,
"learning_rate": 1.3412146545825166e-06,
"loss": 0.0731,
"step": 539
},
{
"epoch": 1.5254237288135593,
"grad_norm": 0.3000001098205527,
"learning_rate": 1.3261292456846648e-06,
"loss": 0.0777,
"step": 540
},
{
"epoch": 1.5282485875706215,
"grad_norm": 0.32275915585442194,
"learning_rate": 1.3111161731035448e-06,
"loss": 0.1028,
"step": 541
},
{
"epoch": 1.5310734463276836,
"grad_norm": 0.28047948839083625,
"learning_rate": 1.2961757324375768e-06,
"loss": 0.0773,
"step": 542
},
{
"epoch": 1.5338983050847457,
"grad_norm": 0.2897511789785588,
"learning_rate": 1.2813082178550929e-06,
"loss": 0.0761,
"step": 543
},
{
"epoch": 1.536723163841808,
"grad_norm": 0.3604669071306025,
"learning_rate": 1.2665139220885615e-06,
"loss": 0.0966,
"step": 544
},
{
"epoch": 1.53954802259887,
"grad_norm": 0.35066125768752815,
"learning_rate": 1.2517931364288133e-06,
"loss": 0.1189,
"step": 545
},
{
"epoch": 1.542372881355932,
"grad_norm": 0.36481440937249643,
"learning_rate": 1.2371461507193077e-06,
"loss": 0.0854,
"step": 546
},
{
"epoch": 1.5451977401129944,
"grad_norm": 0.2705446394892136,
"learning_rate": 1.2225732533504309e-06,
"loss": 0.0681,
"step": 547
},
{
"epoch": 1.5480225988700564,
"grad_norm": 0.2709042435292725,
"learning_rate": 1.2080747312538082e-06,
"loss": 0.0605,
"step": 548
},
{
"epoch": 1.5508474576271185,
"grad_norm": 0.2941674674541573,
"learning_rate": 1.1936508698966664e-06,
"loss": 0.0759,
"step": 549
},
{
"epoch": 1.5536723163841808,
"grad_norm": 0.3301045484204278,
"learning_rate": 1.1793019532762057e-06,
"loss": 0.09,
"step": 550
},
{
"epoch": 1.556497175141243,
"grad_norm": 0.5929461456725253,
"learning_rate": 1.1650282639140066e-06,
"loss": 0.115,
"step": 551
},
{
"epoch": 1.559322033898305,
"grad_norm": 0.29282230586457825,
"learning_rate": 1.1508300828504682e-06,
"loss": 0.068,
"step": 552
},
{
"epoch": 1.5621468926553672,
"grad_norm": 0.290916040961216,
"learning_rate": 1.1367076896392853e-06,
"loss": 0.0759,
"step": 553
},
{
"epoch": 1.5649717514124295,
"grad_norm": 0.3308510796313253,
"learning_rate": 1.122661362341927e-06,
"loss": 0.107,
"step": 554
},
{
"epoch": 1.5677966101694916,
"grad_norm": 0.2902882953470368,
"learning_rate": 1.1086913775221709e-06,
"loss": 0.0817,
"step": 555
},
{
"epoch": 1.5706214689265536,
"grad_norm": 0.25808920224703275,
"learning_rate": 1.0947980102406597e-06,
"loss": 0.063,
"step": 556
},
{
"epoch": 1.573446327683616,
"grad_norm": 0.3042205877096776,
"learning_rate": 1.0809815340494822e-06,
"loss": 0.0755,
"step": 557
},
{
"epoch": 1.576271186440678,
"grad_norm": 0.3051764415373886,
"learning_rate": 1.0672422209867879e-06,
"loss": 0.0652,
"step": 558
},
{
"epoch": 1.57909604519774,
"grad_norm": 0.3146319364999993,
"learning_rate": 1.053580341571428e-06,
"loss": 0.1059,
"step": 559
},
{
"epoch": 1.5819209039548023,
"grad_norm": 0.3027084741124625,
"learning_rate": 1.0399961647976315e-06,
"loss": 0.0812,
"step": 560
},
{
"epoch": 1.5847457627118644,
"grad_norm": 0.3278175149471125,
"learning_rate": 1.0264899581297121e-06,
"loss": 0.1192,
"step": 561
},
{
"epoch": 1.5875706214689265,
"grad_norm": 0.28592160027084834,
"learning_rate": 1.0130619874967983e-06,
"loss": 0.0752,
"step": 562
},
{
"epoch": 1.5903954802259888,
"grad_norm": 0.2930417798367745,
"learning_rate": 9.997125172875943e-07,
"loss": 0.0879,
"step": 563
},
{
"epoch": 1.5932203389830508,
"grad_norm": 0.2704337798545791,
"learning_rate": 9.86441810345183e-07,
"loss": 0.0624,
"step": 564
},
{
"epoch": 1.5960451977401129,
"grad_norm": 0.3166751927355104,
"learning_rate": 9.732501279618388e-07,
"loss": 0.0848,
"step": 565
},
{
"epoch": 1.5988700564971752,
"grad_norm": 0.29461324641929826,
"learning_rate": 9.60137729873898e-07,
"loss": 0.0789,
"step": 566
},
{
"epoch": 1.6016949152542372,
"grad_norm": 0.31189484709881815,
"learning_rate": 9.471048742566313e-07,
"loss": 0.0822,
"step": 567
},
{
"epoch": 1.6045197740112993,
"grad_norm": 0.3466231703998608,
"learning_rate": 9.34151817719166e-07,
"loss": 0.0767,
"step": 568
},
{
"epoch": 1.6073446327683616,
"grad_norm": 0.30675238542761885,
"learning_rate": 9.212788152994367e-07,
"loss": 0.1034,
"step": 569
},
{
"epoch": 1.6101694915254239,
"grad_norm": 0.522761335835565,
"learning_rate": 9.08486120459155e-07,
"loss": 0.1273,
"step": 570
},
{
"epoch": 1.6129943502824857,
"grad_norm": 0.2810136760249764,
"learning_rate": 8.957739850788288e-07,
"loss": 0.073,
"step": 571
},
{
"epoch": 1.615819209039548,
"grad_norm": 0.31293166014889473,
"learning_rate": 8.831426594527976e-07,
"loss": 0.0956,
"step": 572
},
{
"epoch": 1.6186440677966103,
"grad_norm": 0.3408526367512878,
"learning_rate": 8.705923922843041e-07,
"loss": 0.0891,
"step": 573
},
{
"epoch": 1.6214689265536724,
"grad_norm": 0.30421762095488025,
"learning_rate": 8.581234306805969e-07,
"loss": 0.0946,
"step": 574
},
{
"epoch": 1.6242937853107344,
"grad_norm": 0.2940599119195987,
"learning_rate": 8.457360201480702e-07,
"loss": 0.0692,
"step": 575
},
{
"epoch": 1.6271186440677967,
"grad_norm": 0.3002501406760772,
"learning_rate": 8.334304045874248e-07,
"loss": 0.0815,
"step": 576
},
{
"epoch": 1.6299435028248588,
"grad_norm": 0.268425275016888,
"learning_rate": 8.212068262888684e-07,
"loss": 0.0751,
"step": 577
},
{
"epoch": 1.6327683615819208,
"grad_norm": 0.2928986782866679,
"learning_rate": 8.090655259273428e-07,
"loss": 0.0918,
"step": 578
},
{
"epoch": 1.6355932203389831,
"grad_norm": 0.32485035024590025,
"learning_rate": 7.970067425577849e-07,
"loss": 0.0933,
"step": 579
},
{
"epoch": 1.6384180790960452,
"grad_norm": 0.3234267210299417,
"learning_rate": 7.850307136104246e-07,
"loss": 0.0904,
"step": 580
},
{
"epoch": 1.6412429378531073,
"grad_norm": 0.30188930742886005,
"learning_rate": 7.731376748861069e-07,
"loss": 0.0889,
"step": 581
},
{
"epoch": 1.6440677966101696,
"grad_norm": 0.335467078244967,
"learning_rate": 7.613278605516455e-07,
"loss": 0.1325,
"step": 582
},
{
"epoch": 1.6468926553672316,
"grad_norm": 0.3072516075801986,
"learning_rate": 7.4960150313522e-07,
"loss": 0.0783,
"step": 583
},
{
"epoch": 1.6497175141242937,
"grad_norm": 0.3131137120089587,
"learning_rate": 7.379588335217875e-07,
"loss": 0.0995,
"step": 584
},
{
"epoch": 1.652542372881356,
"grad_norm": 0.2914572623071712,
"learning_rate": 7.264000809485483e-07,
"loss": 0.0863,
"step": 585
},
{
"epoch": 1.655367231638418,
"grad_norm": 0.32502736910136926,
"learning_rate": 7.149254730004246e-07,
"loss": 0.1124,
"step": 586
},
{
"epoch": 1.65819209039548,
"grad_norm": 0.326075318059859,
"learning_rate": 7.035352356055786e-07,
"loss": 0.1201,
"step": 587
},
{
"epoch": 1.6610169491525424,
"grad_norm": 0.37300992352749307,
"learning_rate": 6.922295930309691e-07,
"loss": 0.1073,
"step": 588
},
{
"epoch": 1.6638418079096047,
"grad_norm": 0.28876296213713953,
"learning_rate": 6.810087678779353e-07,
"loss": 0.0743,
"step": 589
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.30368795978426233,
"learning_rate": 6.698729810778065e-07,
"loss": 0.0798,
"step": 590
},
{
"epoch": 1.6694915254237288,
"grad_norm": 0.3072791432365542,
"learning_rate": 6.588224518875647e-07,
"loss": 0.0812,
"step": 591
},
{
"epoch": 1.672316384180791,
"grad_norm": 0.3056293727238639,
"learning_rate": 6.478573978855146e-07,
"loss": 0.0684,
"step": 592
},
{
"epoch": 1.6751412429378532,
"grad_norm": 0.30153905844016693,
"learning_rate": 6.369780349670085e-07,
"loss": 0.0779,
"step": 593
},
{
"epoch": 1.6779661016949152,
"grad_norm": 0.2858390899342033,
"learning_rate": 6.261845773401936e-07,
"loss": 0.0713,
"step": 594
},
{
"epoch": 1.6807909604519775,
"grad_norm": 0.30891966645412655,
"learning_rate": 6.154772375217905e-07,
"loss": 0.0837,
"step": 595
},
{
"epoch": 1.6836158192090396,
"grad_norm": 0.29088564996940475,
"learning_rate": 6.048562263329139e-07,
"loss": 0.0825,
"step": 596
},
{
"epoch": 1.6864406779661016,
"grad_norm": 0.29812823046797693,
"learning_rate": 5.943217528949169e-07,
"loss": 0.0927,
"step": 597
},
{
"epoch": 1.689265536723164,
"grad_norm": 0.30652461054045976,
"learning_rate": 5.838740246252794e-07,
"loss": 0.0766,
"step": 598
},
{
"epoch": 1.692090395480226,
"grad_norm": 0.3012630776892009,
"learning_rate": 5.735132472335192e-07,
"loss": 0.0893,
"step": 599
},
{
"epoch": 1.694915254237288,
"grad_norm": 0.30023304223451514,
"learning_rate": 5.632396247171429e-07,
"loss": 0.1049,
"step": 600
},
{
"epoch": 1.6977401129943503,
"grad_norm": 0.3280835461780043,
"learning_rate": 5.530533593576292e-07,
"loss": 0.116,
"step": 601
},
{
"epoch": 1.7005649717514124,
"grad_norm": 0.2762418019865388,
"learning_rate": 5.429546517164486e-07,
"loss": 0.067,
"step": 602
},
{
"epoch": 1.7033898305084745,
"grad_norm": 0.31675842207409677,
"learning_rate": 5.329437006311122e-07,
"loss": 0.0872,
"step": 603
},
{
"epoch": 1.7062146892655368,
"grad_norm": 0.3041486434567392,
"learning_rate": 5.230207032112549e-07,
"loss": 0.0752,
"step": 604
},
{
"epoch": 1.7090395480225988,
"grad_norm": 0.29993780041723445,
"learning_rate": 5.131858548347596e-07,
"loss": 0.0717,
"step": 605
},
{
"epoch": 1.711864406779661,
"grad_norm": 0.30787432406626875,
"learning_rate": 5.034393491439044e-07,
"loss": 0.0802,
"step": 606
},
{
"epoch": 1.7146892655367232,
"grad_norm": 0.28990243338730465,
"learning_rate": 4.93781378041554e-07,
"loss": 0.0871,
"step": 607
},
{
"epoch": 1.7175141242937855,
"grad_norm": 0.3024236161639379,
"learning_rate": 4.842121316873821e-07,
"loss": 0.0855,
"step": 608
},
{
"epoch": 1.7203389830508473,
"grad_norm": 0.3186961475088875,
"learning_rate": 4.747317984941213e-07,
"loss": 0.0875,
"step": 609
},
{
"epoch": 1.7231638418079096,
"grad_norm": 0.2907838927015749,
"learning_rate": 4.653405651238607e-07,
"loss": 0.0908,
"step": 610
},
{
"epoch": 1.725988700564972,
"grad_norm": 0.30763579782876077,
"learning_rate": 4.560386164843639e-07,
"loss": 0.0964,
"step": 611
},
{
"epoch": 1.7288135593220337,
"grad_norm": 0.30987458360594455,
"learning_rate": 4.468261357254339e-07,
"loss": 0.0947,
"step": 612
},
{
"epoch": 1.731638418079096,
"grad_norm": 0.29429416026094735,
"learning_rate": 4.3770330423530626e-07,
"loss": 0.0834,
"step": 613
},
{
"epoch": 1.7344632768361583,
"grad_norm": 0.3063918655948546,
"learning_rate": 4.286703016370719e-07,
"loss": 0.0925,
"step": 614
},
{
"epoch": 1.7372881355932204,
"grad_norm": 0.33329444172148553,
"learning_rate": 4.197273057851464e-07,
"loss": 0.0983,
"step": 615
},
{
"epoch": 1.7401129943502824,
"grad_norm": 0.29709804495802744,
"learning_rate": 4.108744927617669e-07,
"loss": 0.079,
"step": 616
},
{
"epoch": 1.7429378531073447,
"grad_norm": 0.2937681249333296,
"learning_rate": 4.021120368735254e-07,
"loss": 0.088,
"step": 617
},
{
"epoch": 1.7457627118644068,
"grad_norm": 0.33235581919645196,
"learning_rate": 3.934401106479352e-07,
"loss": 0.093,
"step": 618
},
{
"epoch": 1.7485875706214689,
"grad_norm": 0.3052937062176937,
"learning_rate": 3.8485888483003384e-07,
"loss": 0.0987,
"step": 619
},
{
"epoch": 1.7514124293785311,
"grad_norm": 0.2953169881958288,
"learning_rate": 3.763685283790208e-07,
"loss": 0.0861,
"step": 620
},
{
"epoch": 1.7542372881355932,
"grad_norm": 0.39089257775250585,
"learning_rate": 3.679692084649372e-07,
"loss": 0.1092,
"step": 621
},
{
"epoch": 1.7570621468926553,
"grad_norm": 0.298962166079189,
"learning_rate": 3.596610904653652e-07,
"loss": 0.0877,
"step": 622
},
{
"epoch": 1.7598870056497176,
"grad_norm": 0.33068479547784435,
"learning_rate": 3.5144433796217515e-07,
"loss": 0.0868,
"step": 623
},
{
"epoch": 1.7627118644067796,
"grad_norm": 0.29230226311663704,
"learning_rate": 3.433191127383079e-07,
"loss": 0.0786,
"step": 624
},
{
"epoch": 1.7655367231638417,
"grad_norm": 0.342045177865139,
"learning_rate": 3.352855747745859e-07,
"loss": 0.1034,
"step": 625
},
{
"epoch": 1.768361581920904,
"grad_norm": 0.3538547033051671,
"learning_rate": 3.2734388224656575e-07,
"loss": 0.0913,
"step": 626
},
{
"epoch": 1.7711864406779663,
"grad_norm": 0.3022791955228267,
"learning_rate": 3.1949419152142e-07,
"loss": 0.0912,
"step": 627
},
{
"epoch": 1.774011299435028,
"grad_norm": 0.3286478841800299,
"learning_rate": 3.1173665715486076e-07,
"loss": 0.1005,
"step": 628
},
{
"epoch": 1.7768361581920904,
"grad_norm": 0.2914491072414654,
"learning_rate": 3.0407143188809885e-07,
"loss": 0.087,
"step": 629
},
{
"epoch": 1.7796610169491527,
"grad_norm": 0.26788446393967724,
"learning_rate": 2.9649866664483387e-07,
"loss": 0.06,
"step": 630
},
{
"epoch": 1.7824858757062145,
"grad_norm": 0.29391848362521833,
"learning_rate": 2.8901851052828e-07,
"loss": 0.0789,
"step": 631
},
{
"epoch": 1.7853107344632768,
"grad_norm": 0.2778847008048511,
"learning_rate": 2.816311108182368e-07,
"loss": 0.0626,
"step": 632
},
{
"epoch": 1.788135593220339,
"grad_norm": 0.3154659365103027,
"learning_rate": 2.743366129681824e-07,
"loss": 0.101,
"step": 633
},
{
"epoch": 1.7909604519774012,
"grad_norm": 0.31293085126205966,
"learning_rate": 2.671351606024153e-07,
"loss": 0.0762,
"step": 634
},
{
"epoch": 1.7937853107344632,
"grad_norm": 0.47243232664184365,
"learning_rate": 2.6002689551322403e-07,
"loss": 0.1006,
"step": 635
},
{
"epoch": 1.7966101694915255,
"grad_norm": 0.2781939275244853,
"learning_rate": 2.530119576580936e-07,
"loss": 0.0638,
"step": 636
},
{
"epoch": 1.7994350282485876,
"grad_norm": 0.2796358570748197,
"learning_rate": 2.460904851569534e-07,
"loss": 0.0636,
"step": 637
},
{
"epoch": 1.8022598870056497,
"grad_norm": 0.31697163166683606,
"learning_rate": 2.3926261428945386e-07,
"loss": 0.0707,
"step": 638
},
{
"epoch": 1.805084745762712,
"grad_norm": 0.2764809135900223,
"learning_rate": 2.325284794922883e-07,
"loss": 0.0674,
"step": 639
},
{
"epoch": 1.807909604519774,
"grad_norm": 0.3238518566445213,
"learning_rate": 2.2588821335654044e-07,
"loss": 0.0824,
"step": 640
},
{
"epoch": 1.810734463276836,
"grad_norm": 0.34129160389189483,
"learning_rate": 2.1934194662507736e-07,
"loss": 0.0851,
"step": 641
},
{
"epoch": 1.8135593220338984,
"grad_norm": 0.2825014150604838,
"learning_rate": 2.1288980818997272e-07,
"loss": 0.077,
"step": 642
},
{
"epoch": 1.8163841807909604,
"grad_norm": 0.29713126389115424,
"learning_rate": 2.0653192508997222e-07,
"loss": 0.0762,
"step": 643
},
{
"epoch": 1.8192090395480225,
"grad_norm": 0.300338264829181,
"learning_rate": 2.0026842250799038e-07,
"loss": 0.0878,
"step": 644
},
{
"epoch": 1.8220338983050848,
"grad_norm": 1.4166868383475169,
"learning_rate": 1.9409942376864333e-07,
"loss": 0.0867,
"step": 645
},
{
"epoch": 1.8248587570621468,
"grad_norm": 0.25772375662960606,
"learning_rate": 1.8802505033582608e-07,
"loss": 0.0604,
"step": 646
},
{
"epoch": 1.827683615819209,
"grad_norm": 0.31296870168050195,
"learning_rate": 1.8204542181031572e-07,
"loss": 0.0909,
"step": 647
},
{
"epoch": 1.8305084745762712,
"grad_norm": 0.3064057159229424,
"learning_rate": 1.7616065592742038e-07,
"loss": 0.0881,
"step": 648
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.2998465592381362,
"learning_rate": 1.7037086855465902e-07,
"loss": 0.0629,
"step": 649
},
{
"epoch": 1.8361581920903953,
"grad_norm": 0.29682999878586,
"learning_rate": 1.6467617368947918e-07,
"loss": 0.0786,
"step": 650
},
{
"epoch": 1.8389830508474576,
"grad_norm": 0.2781558427742286,
"learning_rate": 1.5907668345701732e-07,
"loss": 0.0818,
"step": 651
},
{
"epoch": 1.84180790960452,
"grad_norm": 0.27914990667274464,
"learning_rate": 1.5357250810788316e-07,
"loss": 0.0739,
"step": 652
},
{
"epoch": 1.844632768361582,
"grad_norm": 0.3438724850417393,
"learning_rate": 1.4816375601599653e-07,
"loss": 0.0723,
"step": 653
},
{
"epoch": 1.847457627118644,
"grad_norm": 0.31828540808245065,
"learning_rate": 1.4285053367645074e-07,
"loss": 0.077,
"step": 654
},
{
"epoch": 1.8502824858757063,
"grad_norm": 0.27972916277751064,
"learning_rate": 1.37632945703412e-07,
"loss": 0.0705,
"step": 655
},
{
"epoch": 1.8531073446327684,
"grad_norm": 0.24716233730443476,
"learning_rate": 1.3251109482806667e-07,
"loss": 0.0489,
"step": 656
},
{
"epoch": 1.8559322033898304,
"grad_norm": 0.308468550353476,
"learning_rate": 1.2748508189659447e-07,
"loss": 0.0866,
"step": 657
},
{
"epoch": 1.8587570621468927,
"grad_norm": 0.3092098443706496,
"learning_rate": 1.2255500586818015e-07,
"loss": 0.1055,
"step": 658
},
{
"epoch": 1.8615819209039548,
"grad_norm": 0.28162343440142174,
"learning_rate": 1.177209638130733e-07,
"loss": 0.072,
"step": 659
},
{
"epoch": 1.8644067796610169,
"grad_norm": 0.3007912961091226,
"learning_rate": 1.1298305091066664e-07,
"loss": 0.0908,
"step": 660
},
{
"epoch": 1.8672316384180792,
"grad_norm": 0.3136447204914414,
"learning_rate": 1.0834136044763188e-07,
"loss": 0.0836,
"step": 661
},
{
"epoch": 1.8700564971751412,
"grad_norm": 0.31357010636279004,
"learning_rate": 1.0379598381607681e-07,
"loss": 0.0807,
"step": 662
},
{
"epoch": 1.8728813559322033,
"grad_norm": 0.3090649577202879,
"learning_rate": 9.93470105117461e-08,
"loss": 0.0949,
"step": 663
},
{
"epoch": 1.8757062146892656,
"grad_norm": 0.30061286344301524,
"learning_rate": 9.499452813226284e-08,
"loss": 0.0832,
"step": 664
},
{
"epoch": 1.8785310734463276,
"grad_norm": 0.3207780749688742,
"learning_rate": 9.073862237539977e-08,
"loss": 0.0922,
"step": 665
},
{
"epoch": 1.8813559322033897,
"grad_norm": 0.35510559900468425,
"learning_rate": 8.657937703739516e-08,
"loss": 0.0989,
"step": 666
},
{
"epoch": 1.884180790960452,
"grad_norm": 0.29122472340879935,
"learning_rate": 8.251687401130137e-08,
"loss": 0.0806,
"step": 667
},
{
"epoch": 1.8870056497175143,
"grad_norm": 0.3387325114735546,
"learning_rate": 7.855119328537109e-08,
"loss": 0.1179,
"step": 668
},
{
"epoch": 1.8898305084745761,
"grad_norm": 0.3838105969146293,
"learning_rate": 7.468241294148471e-08,
"loss": 0.1056,
"step": 669
},
{
"epoch": 1.8926553672316384,
"grad_norm": 0.34521981231042703,
"learning_rate": 7.09106091536127e-08,
"loss": 0.0708,
"step": 670
},
{
"epoch": 1.8954802259887007,
"grad_norm": 0.2860135017170368,
"learning_rate": 6.723585618631456e-08,
"loss": 0.0807,
"step": 671
},
{
"epoch": 1.8983050847457628,
"grad_norm": 0.25713094736882913,
"learning_rate": 6.365822639327724e-08,
"loss": 0.0596,
"step": 672
},
{
"epoch": 1.9011299435028248,
"grad_norm": 0.32493231522498606,
"learning_rate": 6.017779021589065e-08,
"loss": 0.0783,
"step": 673
},
{
"epoch": 1.9039548022598871,
"grad_norm": 0.3195456319023967,
"learning_rate": 5.679461618185944e-08,
"loss": 0.0989,
"step": 674
},
{
"epoch": 1.9067796610169492,
"grad_norm": 0.28832414052354244,
"learning_rate": 5.350877090385731e-08,
"loss": 0.0842,
"step": 675
},
{
"epoch": 1.9096045197740112,
"grad_norm": 0.29862804180466584,
"learning_rate": 5.032031907821089e-08,
"loss": 0.0799,
"step": 676
},
{
"epoch": 1.9124293785310735,
"grad_norm": 0.39261040413818743,
"learning_rate": 4.722932348362852e-08,
"loss": 0.0763,
"step": 677
},
{
"epoch": 1.9152542372881356,
"grad_norm": 0.28345109997512263,
"learning_rate": 4.423584497996458e-08,
"loss": 0.0623,
"step": 678
},
{
"epoch": 1.9180790960451977,
"grad_norm": 0.3730843254054893,
"learning_rate": 4.1339942507018225e-08,
"loss": 0.1051,
"step": 679
},
{
"epoch": 1.92090395480226,
"grad_norm": 0.408781043992154,
"learning_rate": 3.8541673083377086e-08,
"loss": 0.0972,
"step": 680
},
{
"epoch": 1.923728813559322,
"grad_norm": 0.27235202225360894,
"learning_rate": 3.584109180529205e-08,
"loss": 0.078,
"step": 681
},
{
"epoch": 1.926553672316384,
"grad_norm": 0.3249573151531456,
"learning_rate": 3.323825184559204e-08,
"loss": 0.0665,
"step": 682
},
{
"epoch": 1.9293785310734464,
"grad_norm": 0.3917754153996301,
"learning_rate": 3.073320445263817e-08,
"loss": 0.0948,
"step": 683
},
{
"epoch": 1.9322033898305084,
"grad_norm": 0.3436451083397938,
"learning_rate": 2.8325998949314536e-08,
"loss": 0.1001,
"step": 684
},
{
"epoch": 1.9350282485875705,
"grad_norm": 0.2962089436476239,
"learning_rate": 2.6016682732057375e-08,
"loss": 0.092,
"step": 685
},
{
"epoch": 1.9378531073446328,
"grad_norm": 0.31332687230877443,
"learning_rate": 2.3805301269920754e-08,
"loss": 0.0719,
"step": 686
},
{
"epoch": 1.940677966101695,
"grad_norm": 0.29206089331438057,
"learning_rate": 2.1691898103682885e-08,
"loss": 0.0803,
"step": 687
},
{
"epoch": 1.943502824858757,
"grad_norm": 0.29588225232531956,
"learning_rate": 1.9676514844987338e-08,
"loss": 0.0746,
"step": 688
},
{
"epoch": 1.9463276836158192,
"grad_norm": 0.3052178637723924,
"learning_rate": 1.775919117552427e-08,
"loss": 0.0683,
"step": 689
},
{
"epoch": 1.9491525423728815,
"grad_norm": 0.33628210770097194,
"learning_rate": 1.593996484624938e-08,
"loss": 0.0876,
"step": 690
},
{
"epoch": 1.9519774011299436,
"grad_norm": 0.41716535023044066,
"learning_rate": 1.42188716766406e-08,
"loss": 0.0797,
"step": 691
},
{
"epoch": 1.9548022598870056,
"grad_norm": 0.2721830233815887,
"learning_rate": 1.2595945553992572e-08,
"loss": 0.0746,
"step": 692
},
{
"epoch": 1.957627118644068,
"grad_norm": 0.3093867803026701,
"learning_rate": 1.1071218432749942e-08,
"loss": 0.0927,
"step": 693
},
{
"epoch": 1.96045197740113,
"grad_norm": 0.28442945395766916,
"learning_rate": 9.6447203338762e-09,
"loss": 0.0787,
"step": 694
},
{
"epoch": 1.963276836158192,
"grad_norm": 0.30806352104616475,
"learning_rate": 8.316479344266382e-09,
"loss": 0.0888,
"step": 695
},
{
"epoch": 1.9661016949152543,
"grad_norm": 0.366990265292286,
"learning_rate": 7.0865216161902785e-09,
"loss": 0.1067,
"step": 696
},
{
"epoch": 1.9689265536723164,
"grad_norm": 0.37187519570368377,
"learning_rate": 5.954871366779525e-09,
"loss": 0.0814,
"step": 697
},
{
"epoch": 1.9717514124293785,
"grad_norm": 0.3167633384396195,
"learning_rate": 4.921550877550752e-09,
"loss": 0.0687,
"step": 698
},
{
"epoch": 1.9745762711864407,
"grad_norm": 0.301910108787057,
"learning_rate": 3.9865804939659414e-09,
"loss": 0.0748,
"step": 699
},
{
"epoch": 1.9774011299435028,
"grad_norm": 0.32697329312245565,
"learning_rate": 3.1499786250321904e-09,
"loss": 0.1215,
"step": 700
},
{
"epoch": 1.9802259887005649,
"grad_norm": 0.31440661932949104,
"learning_rate": 2.411761742939778e-09,
"loss": 0.0798,
"step": 701
},
{
"epoch": 1.9830508474576272,
"grad_norm": 0.32934297243481625,
"learning_rate": 1.7719443827368677e-09,
"loss": 0.0759,
"step": 702
},
{
"epoch": 1.9858757062146892,
"grad_norm": 0.3018134458199549,
"learning_rate": 1.2305391420458502e-09,
"loss": 0.0935,
"step": 703
},
{
"epoch": 1.9887005649717513,
"grad_norm": 0.2717967961520749,
"learning_rate": 7.875566808107638e-10,
"loss": 0.0621,
"step": 704
},
{
"epoch": 1.9915254237288136,
"grad_norm": 0.30491685297395554,
"learning_rate": 4.4300572109134965e-10,
"loss": 0.0913,
"step": 705
},
{
"epoch": 1.9943502824858759,
"grad_norm": 0.2812227615626389,
"learning_rate": 1.9689304688985667e-10,
"loss": 0.0699,
"step": 706
},
{
"epoch": 1.9971751412429377,
"grad_norm": 0.27084127166875765,
"learning_rate": 4.922350401781461e-11,
"loss": 0.0745,
"step": 707
},
{
"epoch": 2.0,
"grad_norm": 0.29812406633307104,
"learning_rate": 0.0,
"loss": 0.0894,
"step": 708
},
{
"epoch": 2.0,
"step": 708,
"total_flos": 20169300639744.0,
"train_loss": 0.10943256686362675,
"train_runtime": 738.5688,
"train_samples_per_second": 7.666,
"train_steps_per_second": 0.959
}
],
"logging_steps": 1,
"max_steps": 708,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 20169300639744.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}