{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1353055842219929,
"eval_steps": 500,
"global_step": 1180,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 6.7202,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 6.7739,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 6.7015,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 24.833694217574944,
"learning_rate": 3.816793893129771e-06,
"loss": 6.9159,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 24.089403116981405,
"learning_rate": 7.633587786259541e-06,
"loss": 6.8581,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 24.089403116981405,
"learning_rate": 7.633587786259541e-06,
"loss": 6.5905,
"step": 6
},
{
"epoch": 0.0,
"grad_norm": 30.730797406977864,
"learning_rate": 1.1450381679389314e-05,
"loss": 6.9473,
"step": 7
},
{
"epoch": 0.0,
"grad_norm": 39.80648644878442,
"learning_rate": 1.5267175572519083e-05,
"loss": 6.8672,
"step": 8
},
{
"epoch": 0.0,
"grad_norm": 58.83195470368333,
"learning_rate": 1.9083969465648855e-05,
"loss": 6.8771,
"step": 9
},
{
"epoch": 0.0,
"grad_norm": 37.76609702390963,
"learning_rate": 2.2900763358778628e-05,
"loss": 6.8123,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 16.80166973405484,
"learning_rate": 2.6717557251908397e-05,
"loss": 6.4488,
"step": 11
},
{
"epoch": 0.0,
"grad_norm": 25.54197945689145,
"learning_rate": 3.0534351145038166e-05,
"loss": 6.5151,
"step": 12
},
{
"epoch": 0.0,
"grad_norm": 39.55278632647528,
"learning_rate": 3.435114503816794e-05,
"loss": 6.5851,
"step": 13
},
{
"epoch": 0.0,
"grad_norm": 30.02274564294928,
"learning_rate": 3.816793893129771e-05,
"loss": 6.5235,
"step": 14
},
{
"epoch": 0.0,
"grad_norm": 26.152574346690436,
"learning_rate": 4.198473282442748e-05,
"loss": 6.4511,
"step": 15
},
{
"epoch": 0.0,
"grad_norm": 34.33162883163699,
"learning_rate": 4.5801526717557256e-05,
"loss": 6.3687,
"step": 16
},
{
"epoch": 0.0,
"grad_norm": 17.80393923246746,
"learning_rate": 4.9618320610687025e-05,
"loss": 6.4149,
"step": 17
},
{
"epoch": 0.0,
"grad_norm": 15.25400804429858,
"learning_rate": 5.3435114503816794e-05,
"loss": 6.0852,
"step": 18
},
{
"epoch": 0.0,
"grad_norm": 13.035795814576668,
"learning_rate": 5.725190839694656e-05,
"loss": 6.1313,
"step": 19
},
{
"epoch": 0.0,
"grad_norm": 14.788487589977752,
"learning_rate": 6.106870229007633e-05,
"loss": 6.0491,
"step": 20
},
{
"epoch": 0.0,
"grad_norm": 15.87894494754325,
"learning_rate": 6.488549618320611e-05,
"loss": 6.0371,
"step": 21
},
{
"epoch": 0.0,
"grad_norm": 19.93334663657767,
"learning_rate": 6.870229007633588e-05,
"loss": 5.9827,
"step": 22
},
{
"epoch": 0.0,
"grad_norm": 10.586689666709068,
"learning_rate": 7.251908396946565e-05,
"loss": 5.9068,
"step": 23
},
{
"epoch": 0.0,
"grad_norm": 10.586689666709068,
"learning_rate": 7.251908396946565e-05,
"loss": 5.8078,
"step": 24
},
{
"epoch": 0.0,
"grad_norm": 11.009282905400905,
"learning_rate": 7.633587786259542e-05,
"loss": 6.0599,
"step": 25
},
{
"epoch": 0.0,
"grad_norm": 103.37611984633007,
"learning_rate": 8.015267175572518e-05,
"loss": 6.0033,
"step": 26
},
{
"epoch": 0.0,
"grad_norm": 9.093408232269466,
"learning_rate": 8.396946564885496e-05,
"loss": 5.9485,
"step": 27
},
{
"epoch": 0.0,
"grad_norm": 13.409013384441376,
"learning_rate": 8.778625954198472e-05,
"loss": 5.747,
"step": 28
},
{
"epoch": 0.0,
"grad_norm": 20.723480886920953,
"learning_rate": 9.160305343511451e-05,
"loss": 5.9794,
"step": 29
},
{
"epoch": 0.0,
"grad_norm": 27.227889622861333,
"learning_rate": 9.541984732824429e-05,
"loss": 5.9006,
"step": 30
},
{
"epoch": 0.0,
"grad_norm": 26.139568621344836,
"learning_rate": 9.923664122137405e-05,
"loss": 5.5044,
"step": 31
},
{
"epoch": 0.0,
"grad_norm": 14.752230119326473,
"learning_rate": 0.00010305343511450383,
"loss": 5.4819,
"step": 32
},
{
"epoch": 0.0,
"grad_norm": 13.441487864353205,
"learning_rate": 0.00010687022900763359,
"loss": 5.76,
"step": 33
},
{
"epoch": 0.0,
"grad_norm": 19.896976107816972,
"learning_rate": 0.00011068702290076336,
"loss": 5.8905,
"step": 34
},
{
"epoch": 0.0,
"grad_norm": 16.411578435909693,
"learning_rate": 0.00011450381679389313,
"loss": 5.4038,
"step": 35
},
{
"epoch": 0.0,
"grad_norm": 16.411578435909693,
"learning_rate": 0.00011450381679389313,
"loss": 5.4958,
"step": 36
},
{
"epoch": 0.0,
"grad_norm": 16.013488491082,
"learning_rate": 0.0001183206106870229,
"loss": 5.6131,
"step": 37
},
{
"epoch": 0.0,
"grad_norm": 53.19172274365636,
"learning_rate": 0.00012213740458015266,
"loss": 5.324,
"step": 38
},
{
"epoch": 0.0,
"grad_norm": 36.15408253437268,
"learning_rate": 0.00012595419847328244,
"loss": 5.4318,
"step": 39
},
{
"epoch": 0.0,
"grad_norm": 19.271262304467943,
"learning_rate": 0.00012977099236641222,
"loss": 5.6865,
"step": 40
},
{
"epoch": 0.0,
"grad_norm": 7.0078367115137485,
"learning_rate": 0.000133587786259542,
"loss": 5.5618,
"step": 41
},
{
"epoch": 0.0,
"grad_norm": 12.801895272987867,
"learning_rate": 0.00013740458015267177,
"loss": 5.578,
"step": 42
},
{
"epoch": 0.0,
"grad_norm": 9.471273863525738,
"learning_rate": 0.00014122137404580154,
"loss": 5.4528,
"step": 43
},
{
"epoch": 0.01,
"grad_norm": 11.309015734374855,
"learning_rate": 0.0001450381679389313,
"loss": 5.4494,
"step": 44
},
{
"epoch": 0.01,
"grad_norm": 21.597093927953562,
"learning_rate": 0.00014885496183206107,
"loss": 5.5692,
"step": 45
},
{
"epoch": 0.01,
"grad_norm": 6.168922998526033,
"learning_rate": 0.00015267175572519084,
"loss": 5.5889,
"step": 46
},
{
"epoch": 0.01,
"grad_norm": 26.407797807852603,
"learning_rate": 0.00015648854961832062,
"loss": 5.5628,
"step": 47
},
{
"epoch": 0.01,
"grad_norm": 29.41099617504654,
"learning_rate": 0.00016030534351145037,
"loss": 5.6865,
"step": 48
},
{
"epoch": 0.01,
"grad_norm": 54.04515562049684,
"learning_rate": 0.00016412213740458014,
"loss": 5.3337,
"step": 49
},
{
"epoch": 0.01,
"grad_norm": 21.848816944482426,
"learning_rate": 0.00016793893129770992,
"loss": 5.455,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 21.77248137676844,
"learning_rate": 0.0001717557251908397,
"loss": 5.7747,
"step": 51
},
{
"epoch": 0.01,
"grad_norm": 56.97962669273316,
"learning_rate": 0.00017557251908396944,
"loss": 5.8369,
"step": 52
},
{
"epoch": 0.01,
"grad_norm": 191.66821782581252,
"learning_rate": 0.00017938931297709925,
"loss": 5.7323,
"step": 53
},
{
"epoch": 0.01,
"grad_norm": 29.622135366710424,
"learning_rate": 0.00018320610687022902,
"loss": 5.696,
"step": 54
},
{
"epoch": 0.01,
"grad_norm": 24.485759627294552,
"learning_rate": 0.0001870229007633588,
"loss": 5.5594,
"step": 55
},
{
"epoch": 0.01,
"grad_norm": 23.761324220206912,
"learning_rate": 0.00019083969465648857,
"loss": 5.4566,
"step": 56
},
{
"epoch": 0.01,
"grad_norm": 18.30283741545266,
"learning_rate": 0.00019465648854961832,
"loss": 5.5711,
"step": 57
},
{
"epoch": 0.01,
"grad_norm": 46.402196170850495,
"learning_rate": 0.0001984732824427481,
"loss": 5.584,
"step": 58
},
{
"epoch": 0.01,
"grad_norm": 27.29379203075219,
"learning_rate": 0.00020229007633587788,
"loss": 5.1017,
"step": 59
},
{
"epoch": 0.01,
"grad_norm": 25.72050010332052,
"learning_rate": 0.00020610687022900765,
"loss": 5.4472,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 11.90204071984756,
"learning_rate": 0.0002099236641221374,
"loss": 5.5801,
"step": 61
},
{
"epoch": 0.01,
"grad_norm": 10.964221676367517,
"learning_rate": 0.00021374045801526718,
"loss": 5.3488,
"step": 62
},
{
"epoch": 0.01,
"grad_norm": 26.717989647168515,
"learning_rate": 0.00021755725190839695,
"loss": 5.1711,
"step": 63
},
{
"epoch": 0.01,
"grad_norm": 9.600628953387018,
"learning_rate": 0.00022137404580152673,
"loss": 5.4042,
"step": 64
},
{
"epoch": 0.01,
"grad_norm": 22.344895453828535,
"learning_rate": 0.00022519083969465648,
"loss": 5.0538,
"step": 65
},
{
"epoch": 0.01,
"grad_norm": 12.190617761435066,
"learning_rate": 0.00022900763358778625,
"loss": 5.4556,
"step": 66
},
{
"epoch": 0.01,
"grad_norm": 7.053431452864492,
"learning_rate": 0.00023282442748091603,
"loss": 5.2617,
"step": 67
},
{
"epoch": 0.01,
"grad_norm": 49.09212426058433,
"learning_rate": 0.0002366412213740458,
"loss": 5.4076,
"step": 68
},
{
"epoch": 0.01,
"grad_norm": 38.22133703991094,
"learning_rate": 0.00024045801526717558,
"loss": 5.3227,
"step": 69
},
{
"epoch": 0.01,
"grad_norm": 11.852793468649201,
"learning_rate": 0.00024427480916030533,
"loss": 5.1096,
"step": 70
},
{
"epoch": 0.01,
"grad_norm": 11.55578534578585,
"learning_rate": 0.00024809160305343513,
"loss": 5.1777,
"step": 71
},
{
"epoch": 0.01,
"grad_norm": 28.44407093616628,
"learning_rate": 0.0002519083969465649,
"loss": 5.1845,
"step": 72
},
{
"epoch": 0.01,
"grad_norm": 12.945012628276542,
"learning_rate": 0.00025572519083969463,
"loss": 4.909,
"step": 73
},
{
"epoch": 0.01,
"grad_norm": 9.829628432755785,
"learning_rate": 0.00025954198473282443,
"loss": 5.1863,
"step": 74
},
{
"epoch": 0.01,
"grad_norm": 55.241813577801985,
"learning_rate": 0.0002633587786259542,
"loss": 5.0904,
"step": 75
},
{
"epoch": 0.01,
"grad_norm": 5.074212095135481,
"learning_rate": 0.000267175572519084,
"loss": 5.18,
"step": 76
},
{
"epoch": 0.01,
"grad_norm": 30.34910144604937,
"learning_rate": 0.00027099236641221373,
"loss": 5.5077,
"step": 77
},
{
"epoch": 0.01,
"grad_norm": 94.78455661102247,
"learning_rate": 0.00027480916030534353,
"loss": 5.2978,
"step": 78
},
{
"epoch": 0.01,
"grad_norm": 6.076099688665433,
"learning_rate": 0.0002786259541984733,
"loss": 5.1118,
"step": 79
},
{
"epoch": 0.01,
"grad_norm": 7.9946755435844095,
"learning_rate": 0.0002824427480916031,
"loss": 5.0875,
"step": 80
},
{
"epoch": 0.01,
"grad_norm": 5.431891860428228,
"learning_rate": 0.0002862595419847328,
"loss": 5.1588,
"step": 81
},
{
"epoch": 0.01,
"grad_norm": 4.047652245569124,
"learning_rate": 0.0002900763358778626,
"loss": 5.0777,
"step": 82
},
{
"epoch": 0.01,
"grad_norm": 9.660302962418715,
"learning_rate": 0.0002938931297709924,
"loss": 5.073,
"step": 83
},
{
"epoch": 0.01,
"grad_norm": 99.18039670794278,
"learning_rate": 0.00029770992366412214,
"loss": 5.2711,
"step": 84
},
{
"epoch": 0.01,
"grad_norm": 41.24087771516951,
"learning_rate": 0.00030152671755725194,
"loss": 5.2529,
"step": 85
},
{
"epoch": 0.01,
"grad_norm": 15.955748113178291,
"learning_rate": 0.0003053435114503817,
"loss": 5.1201,
"step": 86
},
{
"epoch": 0.01,
"grad_norm": 20.04512145951128,
"learning_rate": 0.0003091603053435115,
"loss": 5.5749,
"step": 87
},
{
"epoch": 0.01,
"grad_norm": 63.28989845481338,
"learning_rate": 0.00031297709923664124,
"loss": 5.3352,
"step": 88
},
{
"epoch": 0.01,
"grad_norm": 30.65392260684527,
"learning_rate": 0.000316793893129771,
"loss": 5.2994,
"step": 89
},
{
"epoch": 0.01,
"grad_norm": 19.570933105458376,
"learning_rate": 0.00032061068702290074,
"loss": 5.4428,
"step": 90
},
{
"epoch": 0.01,
"grad_norm": 33.27702030284148,
"learning_rate": 0.00032442748091603054,
"loss": 5.5179,
"step": 91
},
{
"epoch": 0.01,
"grad_norm": 143.06233640693532,
"learning_rate": 0.0003282442748091603,
"loss": 5.4859,
"step": 92
},
{
"epoch": 0.01,
"grad_norm": 17.652574355644063,
"learning_rate": 0.0003320610687022901,
"loss": 5.3578,
"step": 93
},
{
"epoch": 0.01,
"grad_norm": 99.14161014042287,
"learning_rate": 0.00033587786259541984,
"loss": 5.1511,
"step": 94
},
{
"epoch": 0.01,
"grad_norm": 108.46150115295116,
"learning_rate": 0.00033969465648854964,
"loss": 5.332,
"step": 95
},
{
"epoch": 0.01,
"grad_norm": 13.200674925756799,
"learning_rate": 0.0003435114503816794,
"loss": 5.2414,
"step": 96
},
{
"epoch": 0.01,
"grad_norm": 17.42613416743868,
"learning_rate": 0.0003473282442748092,
"loss": 5.0496,
"step": 97
},
{
"epoch": 0.01,
"grad_norm": 13.787168320397248,
"learning_rate": 0.0003511450381679389,
"loss": 5.2121,
"step": 98
},
{
"epoch": 0.01,
"grad_norm": 13.13064175563793,
"learning_rate": 0.0003549618320610687,
"loss": 5.3562,
"step": 99
},
{
"epoch": 0.01,
"grad_norm": 15.209194797874353,
"learning_rate": 0.0003587786259541985,
"loss": 5.1117,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 48.91476148699009,
"learning_rate": 0.00036259541984732824,
"loss": 5.3374,
"step": 101
},
{
"epoch": 0.01,
"grad_norm": 24.29356061884079,
"learning_rate": 0.00036641221374045805,
"loss": 5.3117,
"step": 102
},
{
"epoch": 0.01,
"grad_norm": 17.452060193355486,
"learning_rate": 0.0003702290076335878,
"loss": 5.27,
"step": 103
},
{
"epoch": 0.01,
"grad_norm": 12.408641021402929,
"learning_rate": 0.0003740458015267176,
"loss": 5.2371,
"step": 104
},
{
"epoch": 0.01,
"grad_norm": 11.216879534678895,
"learning_rate": 0.00037786259541984735,
"loss": 5.3758,
"step": 105
},
{
"epoch": 0.01,
"grad_norm": 14.122037288362222,
"learning_rate": 0.00038167938931297715,
"loss": 5.3233,
"step": 106
},
{
"epoch": 0.01,
"grad_norm": 7.779989122053916,
"learning_rate": 0.00038549618320610684,
"loss": 5.4026,
"step": 107
},
{
"epoch": 0.01,
"grad_norm": 11.626138250979961,
"learning_rate": 0.00038931297709923665,
"loss": 5.0434,
"step": 108
},
{
"epoch": 0.01,
"grad_norm": 7.385310006965435,
"learning_rate": 0.0003931297709923664,
"loss": 5.1588,
"step": 109
},
{
"epoch": 0.01,
"grad_norm": 6.111534750981958,
"learning_rate": 0.0003969465648854962,
"loss": 5.3004,
"step": 110
},
{
"epoch": 0.01,
"grad_norm": 13.108563585041713,
"learning_rate": 0.00040076335877862595,
"loss": 4.9844,
"step": 111
},
{
"epoch": 0.01,
"grad_norm": 12.463164343212153,
"learning_rate": 0.00040458015267175575,
"loss": 5.0383,
"step": 112
},
{
"epoch": 0.01,
"grad_norm": 26.111947637716288,
"learning_rate": 0.0004083969465648855,
"loss": 4.9475,
"step": 113
},
{
"epoch": 0.01,
"grad_norm": 8.324439787028272,
"learning_rate": 0.0004122137404580153,
"loss": 5.0533,
"step": 114
},
{
"epoch": 0.01,
"grad_norm": 9.234173217749362,
"learning_rate": 0.00041603053435114505,
"loss": 4.9712,
"step": 115
},
{
"epoch": 0.01,
"grad_norm": 5.525390780179457,
"learning_rate": 0.0004198473282442748,
"loss": 5.026,
"step": 116
},
{
"epoch": 0.01,
"grad_norm": 5.227610724744892,
"learning_rate": 0.00042366412213740455,
"loss": 5.2434,
"step": 117
},
{
"epoch": 0.01,
"grad_norm": 7.154430996699472,
"learning_rate": 0.00042748091603053435,
"loss": 5.0155,
"step": 118
},
{
"epoch": 0.01,
"grad_norm": 8.76941351196938,
"learning_rate": 0.00043129770992366415,
"loss": 4.86,
"step": 119
},
{
"epoch": 0.01,
"grad_norm": 3.4065374733856135,
"learning_rate": 0.0004351145038167939,
"loss": 4.7745,
"step": 120
},
{
"epoch": 0.01,
"grad_norm": 4.933753082082301,
"learning_rate": 0.0004389312977099237,
"loss": 5.1687,
"step": 121
},
{
"epoch": 0.01,
"grad_norm": 4.17356231729226,
"learning_rate": 0.00044274809160305345,
"loss": 4.8747,
"step": 122
},
{
"epoch": 0.01,
"grad_norm": 3.455904317777208,
"learning_rate": 0.00044656488549618326,
"loss": 4.6564,
"step": 123
},
{
"epoch": 0.01,
"grad_norm": 3.9064455284575668,
"learning_rate": 0.00045038167938931295,
"loss": 4.9689,
"step": 124
},
{
"epoch": 0.01,
"grad_norm": 2.85035028157288,
"learning_rate": 0.00045419847328244275,
"loss": 5.0042,
"step": 125
},
{
"epoch": 0.01,
"grad_norm": 42.12231453713657,
"learning_rate": 0.0004580152671755725,
"loss": 4.9134,
"step": 126
},
{
"epoch": 0.01,
"grad_norm": 33.33935309010155,
"learning_rate": 0.0004618320610687023,
"loss": 4.7722,
"step": 127
},
{
"epoch": 0.01,
"grad_norm": 3.264613929291729,
"learning_rate": 0.00046564885496183206,
"loss": 4.7236,
"step": 128
},
{
"epoch": 0.01,
"grad_norm": 6.284470635260648,
"learning_rate": 0.00046946564885496186,
"loss": 4.9849,
"step": 129
},
{
"epoch": 0.01,
"grad_norm": 2.701662592354433,
"learning_rate": 0.0004732824427480916,
"loss": 4.7388,
"step": 130
},
{
"epoch": 0.02,
"grad_norm": 17.156141458264692,
"learning_rate": 0.0004770992366412214,
"loss": 4.9292,
"step": 131
},
{
"epoch": 0.02,
"grad_norm": 11.896147445442196,
"learning_rate": 0.00048091603053435116,
"loss": 5.0198,
"step": 132
},
{
"epoch": 0.02,
"grad_norm": 3.9435670187222325,
"learning_rate": 0.0004847328244274809,
"loss": 4.6752,
"step": 133
},
{
"epoch": 0.02,
"grad_norm": 7.213241065570718,
"learning_rate": 0.0004885496183206107,
"loss": 4.6556,
"step": 134
},
{
"epoch": 0.02,
"grad_norm": 62.61093309333762,
"learning_rate": 0.0004923664122137404,
"loss": 5.0054,
"step": 135
},
{
"epoch": 0.02,
"grad_norm": 112.46264314904546,
"learning_rate": 0.0004961832061068703,
"loss": 5.1782,
"step": 136
},
{
"epoch": 0.02,
"grad_norm": 41.97202498980487,
"learning_rate": 0.0005,
"loss": 5.2959,
"step": 137
},
{
"epoch": 0.02,
"grad_norm": 11.493483729668776,
"learning_rate": 0.0005038167938931298,
"loss": 5.8508,
"step": 138
},
{
"epoch": 0.02,
"grad_norm": 11.021723732018117,
"learning_rate": 0.0005076335877862596,
"loss": 5.2118,
"step": 139
},
{
"epoch": 0.02,
"grad_norm": 7.006304671598398,
"learning_rate": 0.0005114503816793893,
"loss": 5.3601,
"step": 140
},
{
"epoch": 0.02,
"grad_norm": 7.289194721886362,
"learning_rate": 0.0005152671755725191,
"loss": 5.2464,
"step": 141
},
{
"epoch": 0.02,
"grad_norm": 7.311468819904173,
"learning_rate": 0.0005190839694656489,
"loss": 5.2711,
"step": 142
},
{
"epoch": 0.02,
"grad_norm": 26.398719072271508,
"learning_rate": 0.0005229007633587787,
"loss": 5.289,
"step": 143
},
{
"epoch": 0.02,
"grad_norm": 11.625422270433745,
"learning_rate": 0.0005267175572519084,
"loss": 5.2699,
"step": 144
},
{
"epoch": 0.02,
"grad_norm": 49.995882338347684,
"learning_rate": 0.0005305343511450382,
"loss": 5.295,
"step": 145
},
{
"epoch": 0.02,
"grad_norm": 38.177044422132425,
"learning_rate": 0.000534351145038168,
"loss": 5.1886,
"step": 146
},
{
"epoch": 0.02,
"grad_norm": 18.345238825731563,
"learning_rate": 0.0005381679389312977,
"loss": 5.1197,
"step": 147
},
{
"epoch": 0.02,
"grad_norm": 6.121211476002137,
"learning_rate": 0.0005419847328244275,
"loss": 5.1665,
"step": 148
},
{
"epoch": 0.02,
"grad_norm": 7.033090979387724,
"learning_rate": 0.0005458015267175572,
"loss": 5.2001,
"step": 149
},
{
"epoch": 0.02,
"grad_norm": 8.465524804046279,
"learning_rate": 0.0005496183206106871,
"loss": 5.1638,
"step": 150
},
{
"epoch": 0.02,
"grad_norm": 15.612237680230512,
"learning_rate": 0.0005534351145038168,
"loss": 4.7495,
"step": 151
},
{
"epoch": 0.02,
"grad_norm": 6.388943581119024,
"learning_rate": 0.0005572519083969466,
"loss": 4.9927,
"step": 152
},
{
"epoch": 0.02,
"grad_norm": 4.0182990972402575,
"learning_rate": 0.0005610687022900763,
"loss": 5.0069,
"step": 153
},
{
"epoch": 0.02,
"grad_norm": 4.445692947191389,
"learning_rate": 0.0005648854961832062,
"loss": 4.8021,
"step": 154
},
{
"epoch": 0.02,
"grad_norm": 14.423302455504722,
"learning_rate": 0.0005687022900763359,
"loss": 4.9944,
"step": 155
},
{
"epoch": 0.02,
"grad_norm": 3.863803180947149,
"learning_rate": 0.0005725190839694656,
"loss": 5.0374,
"step": 156
},
{
"epoch": 0.02,
"grad_norm": 4.675446873357981,
"learning_rate": 0.0005763358778625954,
"loss": 4.8162,
"step": 157
},
{
"epoch": 0.02,
"grad_norm": 7.031369179358917,
"learning_rate": 0.0005801526717557252,
"loss": 5.0332,
"step": 158
},
{
"epoch": 0.02,
"grad_norm": 37.0810100366357,
"learning_rate": 0.000583969465648855,
"loss": 4.7682,
"step": 159
},
{
"epoch": 0.02,
"grad_norm": 6.66547076623082,
"learning_rate": 0.0005877862595419848,
"loss": 4.8632,
"step": 160
},
{
"epoch": 0.02,
"grad_norm": 3.3275498864086015,
"learning_rate": 0.0005916030534351145,
"loss": 4.6909,
"step": 161
},
{
"epoch": 0.02,
"grad_norm": 26.188174306741093,
"learning_rate": 0.0005954198473282443,
"loss": 4.6418,
"step": 162
},
{
"epoch": 0.02,
"grad_norm": 10.37007904683383,
"learning_rate": 0.0005992366412213741,
"loss": 4.7376,
"step": 163
},
{
"epoch": 0.02,
"grad_norm": 5.244057220680342,
"learning_rate": 0.0006030534351145039,
"loss": 4.8339,
"step": 164
},
{
"epoch": 0.02,
"grad_norm": 2.5201069409738923,
"learning_rate": 0.0006068702290076335,
"loss": 4.6116,
"step": 165
},
{
"epoch": 0.02,
"grad_norm": 5.974803396981725,
"learning_rate": 0.0006106870229007634,
"loss": 4.5625,
"step": 166
},
{
"epoch": 0.02,
"grad_norm": 26.210473236143425,
"learning_rate": 0.0006145038167938931,
"loss": 4.7733,
"step": 167
},
{
"epoch": 0.02,
"grad_norm": 27.9365658071239,
"learning_rate": 0.000618320610687023,
"loss": 4.911,
"step": 168
},
{
"epoch": 0.02,
"grad_norm": 8.230381951592534,
"learning_rate": 0.0006221374045801526,
"loss": 4.7406,
"step": 169
},
{
"epoch": 0.02,
"grad_norm": 4.732268568832821,
"learning_rate": 0.0006259541984732825,
"loss": 4.6123,
"step": 170
},
{
"epoch": 0.02,
"grad_norm": 9.15487252616632,
"learning_rate": 0.0006297709923664122,
"loss": 4.7896,
"step": 171
},
{
"epoch": 0.02,
"grad_norm": 3.4653332544275686,
"learning_rate": 0.000633587786259542,
"loss": 4.7309,
"step": 172
},
{
"epoch": 0.02,
"grad_norm": 4.850780915920418,
"learning_rate": 0.0006374045801526717,
"loss": 4.7314,
"step": 173
},
{
"epoch": 0.02,
"grad_norm": 7.170622336197695,
"learning_rate": 0.0006412213740458015,
"loss": 4.4504,
"step": 174
},
{
"epoch": 0.02,
"grad_norm": 8.243144623116297,
"learning_rate": 0.0006450381679389313,
"loss": 4.804,
"step": 175
},
{
"epoch": 0.02,
"grad_norm": 2.966695490413193,
"learning_rate": 0.0006488549618320611,
"loss": 4.3688,
"step": 176
},
{
"epoch": 0.02,
"grad_norm": 3.1607265339983766,
"learning_rate": 0.0006526717557251909,
"loss": 4.4259,
"step": 177
},
{
"epoch": 0.02,
"grad_norm": 12.640808983517074,
"learning_rate": 0.0006564885496183206,
"loss": 4.7304,
"step": 178
},
{
"epoch": 0.02,
"grad_norm": 3.6523872935286272,
"learning_rate": 0.0006603053435114504,
"loss": 4.7386,
"step": 179
},
{
"epoch": 0.02,
"grad_norm": 8.30083609660972,
"learning_rate": 0.0006641221374045802,
"loss": 4.5598,
"step": 180
},
{
"epoch": 0.02,
"grad_norm": 3.0623057509893377,
"learning_rate": 0.0006679389312977099,
"loss": 4.5194,
"step": 181
},
{
"epoch": 0.02,
"grad_norm": 4.883454177185916,
"learning_rate": 0.0006717557251908397,
"loss": 4.5887,
"step": 182
},
{
"epoch": 0.02,
"grad_norm": 2.9525920582406457,
"learning_rate": 0.0006755725190839694,
"loss": 4.541,
"step": 183
},
{
"epoch": 0.02,
"grad_norm": 4.270008428820161,
"learning_rate": 0.0006793893129770993,
"loss": 4.8857,
"step": 184
},
{
"epoch": 0.02,
"grad_norm": 5.672713861411733,
"learning_rate": 0.000683206106870229,
"loss": 4.3204,
"step": 185
},
{
"epoch": 0.02,
"grad_norm": 3.761348480989461,
"learning_rate": 0.0006870229007633588,
"loss": 4.3439,
"step": 186
},
{
"epoch": 0.02,
"grad_norm": 2.4326020308075034,
"learning_rate": 0.0006908396946564885,
"loss": 4.7289,
"step": 187
},
{
"epoch": 0.02,
"grad_norm": 7.427959833506058,
"learning_rate": 0.0006946564885496184,
"loss": 4.5812,
"step": 188
},
{
"epoch": 0.02,
"grad_norm": 1.961163714438519,
"learning_rate": 0.0006984732824427481,
"loss": 4.5939,
"step": 189
},
{
"epoch": 0.02,
"grad_norm": 6.895535057551813,
"learning_rate": 0.0007022900763358778,
"loss": 4.5405,
"step": 190
},
{
"epoch": 0.02,
"grad_norm": 8.807431310414946,
"learning_rate": 0.0007061068702290076,
"loss": 4.5149,
"step": 191
},
{
"epoch": 0.02,
"grad_norm": 3.388905199194286,
"learning_rate": 0.0007099236641221374,
"loss": 4.5301,
"step": 192
},
{
"epoch": 0.02,
"grad_norm": 7.913993022689122,
"learning_rate": 0.0007137404580152672,
"loss": 4.3506,
"step": 193
},
{
"epoch": 0.02,
"grad_norm": 2.333063361221933,
"learning_rate": 0.000717557251908397,
"loss": 4.6727,
"step": 194
},
{
"epoch": 0.02,
"grad_norm": 3.2359830428102474,
"learning_rate": 0.0007213740458015267,
"loss": 4.6725,
"step": 195
},
{
"epoch": 0.02,
"grad_norm": 3.1482078742978143,
"learning_rate": 0.0007251908396946565,
"loss": 4.6814,
"step": 196
},
{
"epoch": 0.02,
"grad_norm": 2.593895050022191,
"learning_rate": 0.0007290076335877863,
"loss": 4.3763,
"step": 197
},
{
"epoch": 0.02,
"grad_norm": 2.942514833308484,
"learning_rate": 0.0007328244274809161,
"loss": 4.5224,
"step": 198
},
{
"epoch": 0.02,
"grad_norm": 2.252146175157487,
"learning_rate": 0.0007366412213740457,
"loss": 4.607,
"step": 199
},
{
"epoch": 0.02,
"grad_norm": 4.628967497586178,
"learning_rate": 0.0007404580152671756,
"loss": 4.6547,
"step": 200
},
{
"epoch": 0.02,
"grad_norm": 3.1214679416699287,
"learning_rate": 0.0007442748091603053,
"loss": 4.4936,
"step": 201
},
{
"epoch": 0.02,
"grad_norm": 3.2123452294111003,
"learning_rate": 0.0007480916030534352,
"loss": 4.5136,
"step": 202
},
{
"epoch": 0.02,
"grad_norm": 2.6031227255266622,
"learning_rate": 0.0007519083969465648,
"loss": 4.5817,
"step": 203
},
{
"epoch": 0.02,
"grad_norm": 4.501632369287978,
"learning_rate": 0.0007557251908396947,
"loss": 4.5266,
"step": 204
},
{
"epoch": 0.02,
"grad_norm": 3.3047686153645777,
"learning_rate": 0.0007595419847328244,
"loss": 4.6692,
"step": 205
},
{
"epoch": 0.02,
"grad_norm": 1.90038519113313,
"learning_rate": 0.0007633587786259543,
"loss": 4.6605,
"step": 206
},
{
"epoch": 0.02,
"grad_norm": 3.366071645507772,
"learning_rate": 0.0007671755725190839,
"loss": 4.6692,
"step": 207
},
{
"epoch": 0.02,
"grad_norm": 4.031546045036477,
"learning_rate": 0.0007709923664122137,
"loss": 4.5349,
"step": 208
},
{
"epoch": 0.02,
"grad_norm": 8.290888966118121,
"learning_rate": 0.0007748091603053435,
"loss": 4.5302,
"step": 209
},
{
"epoch": 0.02,
"grad_norm": 2.353501923149862,
"learning_rate": 0.0007786259541984733,
"loss": 4.5482,
"step": 210
},
{
"epoch": 0.02,
"grad_norm": 6.391124795162947,
"learning_rate": 0.000782442748091603,
"loss": 4.6502,
"step": 211
},
{
"epoch": 0.02,
"grad_norm": 2.000245415269884,
"learning_rate": 0.0007862595419847328,
"loss": 4.7538,
"step": 212
},
{
"epoch": 0.02,
"grad_norm": 6.0989354402413465,
"learning_rate": 0.0007900763358778626,
"loss": 4.5299,
"step": 213
},
{
"epoch": 0.02,
"grad_norm": 2.7640114509283844,
"learning_rate": 0.0007938931297709924,
"loss": 4.5933,
"step": 214
},
{
"epoch": 0.02,
"grad_norm": 2.2141741723409734,
"learning_rate": 0.0007977099236641223,
"loss": 4.6185,
"step": 215
},
{
"epoch": 0.02,
"grad_norm": 2.9889777111731024,
"learning_rate": 0.0008015267175572519,
"loss": 4.2585,
"step": 216
},
{
"epoch": 0.02,
"grad_norm": 3.8055064049034963,
"learning_rate": 0.0008053435114503816,
"loss": 4.6684,
"step": 217
},
{
"epoch": 0.02,
"grad_norm": 2.4440615330231577,
"learning_rate": 0.0008091603053435115,
"loss": 4.4236,
"step": 218
},
{
"epoch": 0.03,
"grad_norm": 2.67540754637004,
"learning_rate": 0.0008129770992366412,
"loss": 4.3685,
"step": 219
},
{
"epoch": 0.03,
"grad_norm": 5.793630358099666,
"learning_rate": 0.000816793893129771,
"loss": 4.48,
"step": 220
},
{
"epoch": 0.03,
"grad_norm": 2.483681384184715,
"learning_rate": 0.0008206106870229007,
"loss": 4.4383,
"step": 221
},
{
"epoch": 0.03,
"grad_norm": 2.073067619234986,
"learning_rate": 0.0008244274809160306,
"loss": 4.7067,
"step": 222
},
{
"epoch": 0.03,
"grad_norm": 5.512466666008747,
"learning_rate": 0.0008282442748091604,
"loss": 4.4971,
"step": 223
},
{
"epoch": 0.03,
"grad_norm": 2.4680754308671604,
"learning_rate": 0.0008320610687022901,
"loss": 4.7178,
"step": 224
},
{
"epoch": 0.03,
"grad_norm": 2.350356149063761,
"learning_rate": 0.0008358778625954198,
"loss": 4.5058,
"step": 225
},
{
"epoch": 0.03,
"grad_norm": 1.9235303889832627,
"learning_rate": 0.0008396946564885496,
"loss": 4.4643,
"step": 226
},
{
"epoch": 0.03,
"grad_norm": 3.4372031826778033,
"learning_rate": 0.0008435114503816795,
"loss": 4.5712,
"step": 227
},
{
"epoch": 0.03,
"grad_norm": 1.9601887476121664,
"learning_rate": 0.0008473282442748091,
"loss": 4.4648,
"step": 228
},
{
"epoch": 0.03,
"grad_norm": 2.45846639486102,
"learning_rate": 0.000851145038167939,
"loss": 4.5603,
"step": 229
},
{
"epoch": 0.03,
"grad_norm": 2.7887446648129437,
"learning_rate": 0.0008549618320610687,
"loss": 4.6925,
"step": 230
},
{
"epoch": 0.03,
"grad_norm": 2.2147831858211275,
"learning_rate": 0.0008587786259541986,
"loss": 4.4329,
"step": 231
},
{
"epoch": 0.03,
"grad_norm": 4.3706275121022315,
"learning_rate": 0.0008625954198473283,
"loss": 4.6871,
"step": 232
},
{
"epoch": 0.03,
"grad_norm": 3.7937170135720426,
"learning_rate": 0.0008664122137404581,
"loss": 4.4012,
"step": 233
},
{
"epoch": 0.03,
"grad_norm": 4.435956727807511,
"learning_rate": 0.0008702290076335878,
"loss": 4.3689,
"step": 234
},
{
"epoch": 0.03,
"grad_norm": 5.239393029537955,
"learning_rate": 0.0008740458015267176,
"loss": 4.4296,
"step": 235
},
{
"epoch": 0.03,
"grad_norm": 3.187055953738488,
"learning_rate": 0.0008778625954198474,
"loss": 4.681,
"step": 236
},
{
"epoch": 0.03,
"grad_norm": 2.9083664215185605,
"learning_rate": 0.000881679389312977,
"loss": 4.4031,
"step": 237
},
{
"epoch": 0.03,
"grad_norm": 3.1200850538271188,
"learning_rate": 0.0008854961832061069,
"loss": 4.5064,
"step": 238
},
{
"epoch": 0.03,
"grad_norm": 2.277426952151577,
"learning_rate": 0.0008893129770992367,
"loss": 4.4562,
"step": 239
},
{
"epoch": 0.03,
"grad_norm": 2.588541096844217,
"learning_rate": 0.0008931297709923665,
"loss": 4.4816,
"step": 240
},
{
"epoch": 0.03,
"grad_norm": 2.1713161104056526,
"learning_rate": 0.0008969465648854962,
"loss": 4.2547,
"step": 241
},
{
"epoch": 0.03,
"grad_norm": 1.9979300139791905,
"learning_rate": 0.0009007633587786259,
"loss": 4.4082,
"step": 242
},
{
"epoch": 0.03,
"grad_norm": 7.13986641828722,
"learning_rate": 0.0009045801526717558,
"loss": 4.3108,
"step": 243
},
{
"epoch": 0.03,
"grad_norm": 2.464791404148931,
"learning_rate": 0.0009083969465648855,
"loss": 4.4189,
"step": 244
},
{
"epoch": 0.03,
"grad_norm": 2.234666589131025,
"learning_rate": 0.0009122137404580153,
"loss": 4.4217,
"step": 245
},
{
"epoch": 0.03,
"grad_norm": 4.733346152783746,
"learning_rate": 0.000916030534351145,
"loss": 4.436,
"step": 246
},
{
"epoch": 0.03,
"grad_norm": 1.8815470156640648,
"learning_rate": 0.0009198473282442749,
"loss": 4.3699,
"step": 247
},
{
"epoch": 0.03,
"grad_norm": 2.985621573989371,
"learning_rate": 0.0009236641221374046,
"loss": 4.4429,
"step": 248
},
{
"epoch": 0.03,
"grad_norm": 2.9757159724836653,
"learning_rate": 0.0009274809160305345,
"loss": 4.4512,
"step": 249
},
{
"epoch": 0.03,
"grad_norm": 2.579476705353371,
"learning_rate": 0.0009312977099236641,
"loss": 4.4356,
"step": 250
},
{
"epoch": 0.03,
"grad_norm": 2.9285783053055527,
"learning_rate": 0.0009351145038167939,
"loss": 4.5585,
"step": 251
},
{
"epoch": 0.03,
"grad_norm": 3.65895208962216,
"learning_rate": 0.0009389312977099237,
"loss": 4.2691,
"step": 252
},
{
"epoch": 0.03,
"grad_norm": 1.9673319651551415,
"learning_rate": 0.0009427480916030535,
"loss": 4.5441,
"step": 253
},
{
"epoch": 0.03,
"grad_norm": 3.1238476750284234,
"learning_rate": 0.0009465648854961832,
"loss": 4.4657,
"step": 254
},
{
"epoch": 0.03,
"grad_norm": 1.9438137125018247,
"learning_rate": 0.000950381679389313,
"loss": 4.6594,
"step": 255
},
{
"epoch": 0.03,
"grad_norm": 2.450145616842104,
"learning_rate": 0.0009541984732824428,
"loss": 4.7328,
"step": 256
},
{
"epoch": 0.03,
"grad_norm": 4.407173830430405,
"learning_rate": 0.0009580152671755726,
"loss": 4.1906,
"step": 257
},
{
"epoch": 0.03,
"grad_norm": 4.436698880591971,
"learning_rate": 0.0009618320610687023,
"loss": 4.4834,
"step": 258
},
{
"epoch": 0.03,
"grad_norm": 5.230990693011375,
"learning_rate": 0.0009656488549618321,
"loss": 4.4125,
"step": 259
},
{
"epoch": 0.03,
"grad_norm": 3.3753204853522574,
"learning_rate": 0.0009694656488549618,
"loss": 4.1311,
"step": 260
},
{
"epoch": 0.03,
"grad_norm": 6.025454995306366,
"learning_rate": 0.0009732824427480917,
"loss": 4.5515,
"step": 261
},
{
"epoch": 0.03,
"grad_norm": 6.099556869358559,
"learning_rate": 0.0009770992366412213,
"loss": 4.3692,
"step": 262
},
{
"epoch": 0.03,
"grad_norm": 2.2532819690576127,
"learning_rate": 0.0009809160305343512,
"loss": 4.4647,
"step": 263
},
{
"epoch": 0.03,
"grad_norm": 2.9321564302319585,
"learning_rate": 0.0009847328244274808,
"loss": 4.5391,
"step": 264
},
{
"epoch": 0.03,
"grad_norm": 1.7877794819325101,
"learning_rate": 0.0009885496183206107,
"loss": 4.4656,
"step": 265
},
{
"epoch": 0.03,
"grad_norm": 1.7179336582935352,
"learning_rate": 0.0009923664122137405,
"loss": 4.4662,
"step": 266
},
{
"epoch": 0.03,
"grad_norm": 2.048454145524189,
"learning_rate": 0.0009961832061068704,
"loss": 4.247,
"step": 267
},
{
"epoch": 0.03,
"grad_norm": 2.0370693677273164,
"learning_rate": 0.001,
"loss": 4.4564,
"step": 268
},
{
"epoch": 0.03,
"grad_norm": 2.5222638452498476,
"learning_rate": 0.0009999999655172654,
"loss": 4.2618,
"step": 269
},
{
"epoch": 0.03,
"grad_norm": 2.1108937154982486,
"learning_rate": 0.0009999998620690664,
"loss": 4.5203,
"step": 270
},
{
"epoch": 0.03,
"grad_norm": 1.8392493558241325,
"learning_rate": 0.0009999996896554175,
"loss": 4.2112,
"step": 271
},
{
"epoch": 0.03,
"grad_norm": 2.2938832897333836,
"learning_rate": 0.0009999994482763422,
"loss": 4.3127,
"step": 272
},
{
"epoch": 0.03,
"grad_norm": 1.7529449127976362,
"learning_rate": 0.0009999991379318737,
"loss": 4.3054,
"step": 273
},
{
"epoch": 0.03,
"grad_norm": 1.9574686246354027,
"learning_rate": 0.000999998758622055,
"loss": 4.4122,
"step": 274
},
{
"epoch": 0.03,
"grad_norm": 5.821259062674746,
"learning_rate": 0.0009999983103469385,
"loss": 4.4032,
"step": 275
},
{
"epoch": 0.03,
"grad_norm": 2.4689544587787062,
"learning_rate": 0.0009999977931065857,
"loss": 4.3253,
"step": 276
},
{
"epoch": 0.03,
"grad_norm": 2.2321621725081267,
"learning_rate": 0.0009999972069010686,
"loss": 4.1706,
"step": 277
},
{
"epoch": 0.03,
"grad_norm": 3.4926949882090748,
"learning_rate": 0.0009999965517304673,
"loss": 4.3912,
"step": 278
},
{
"epoch": 0.03,
"grad_norm": 2.360900925114398,
"learning_rate": 0.0009999958275948725,
"loss": 4.3268,
"step": 279
},
{
"epoch": 0.03,
"grad_norm": 2.201926126895682,
"learning_rate": 0.0009999950344943842,
"loss": 4.4753,
"step": 280
},
{
"epoch": 0.03,
"grad_norm": 1.7171916658696942,
"learning_rate": 0.0009999941724291115,
"loss": 4.3981,
"step": 281
},
{
"epoch": 0.03,
"grad_norm": 2.169691498257731,
"learning_rate": 0.0009999932413991737,
"loss": 4.3607,
"step": 282
},
{
"epoch": 0.03,
"grad_norm": 1.6266513074005855,
"learning_rate": 0.0009999922414046986,
"loss": 4.4905,
"step": 283
},
{
"epoch": 0.03,
"grad_norm": 2.7426231842873903,
"learning_rate": 0.0009999911724458248,
"loss": 4.5396,
"step": 284
},
{
"epoch": 0.03,
"grad_norm": 1.5392942285058555,
"learning_rate": 0.0009999900345226994,
"loss": 4.4994,
"step": 285
},
{
"epoch": 0.03,
"grad_norm": 1.678356795108113,
"learning_rate": 0.0009999888276354795,
"loss": 4.4029,
"step": 286
},
{
"epoch": 0.03,
"grad_norm": 2.691200017633891,
"learning_rate": 0.0009999875517843315,
"loss": 4.2561,
"step": 287
},
{
"epoch": 0.03,
"grad_norm": 3.3950536951859154,
"learning_rate": 0.0009999862069694312,
"loss": 4.4023,
"step": 288
},
{
"epoch": 0.03,
"grad_norm": 1.8958335318394337,
"learning_rate": 0.0009999847931909645,
"loss": 4.5222,
"step": 289
},
{
"epoch": 0.03,
"grad_norm": 1.7171986816761662,
"learning_rate": 0.000999983310449126,
"loss": 4.2009,
"step": 290
},
{
"epoch": 0.03,
"grad_norm": 1.7441516633821998,
"learning_rate": 0.0009999817587441203,
"loss": 4.2292,
"step": 291
},
{
"epoch": 0.03,
"grad_norm": 2.8399045457207577,
"learning_rate": 0.0009999801380761615,
"loss": 4.3146,
"step": 292
},
{
"epoch": 0.03,
"grad_norm": 2.5530683875413502,
"learning_rate": 0.0009999784484454734,
"loss": 4.6165,
"step": 293
},
{
"epoch": 0.03,
"grad_norm": 6.893237896818217,
"learning_rate": 0.0009999766898522884,
"loss": 4.4322,
"step": 294
},
{
"epoch": 0.03,
"grad_norm": 1.7278460987247395,
"learning_rate": 0.0009999748622968496,
"loss": 4.246,
"step": 295
},
{
"epoch": 0.03,
"grad_norm": 4.980795698751086,
"learning_rate": 0.000999972965779409,
"loss": 4.257,
"step": 296
},
{
"epoch": 0.03,
"grad_norm": 2.7099296748788064,
"learning_rate": 0.000999971000300228,
"loss": 4.4146,
"step": 297
},
{
"epoch": 0.03,
"grad_norm": 4.015270426069638,
"learning_rate": 0.000999968965859578,
"loss": 4.3122,
"step": 298
},
{
"epoch": 0.03,
"grad_norm": 1.7048017849738346,
"learning_rate": 0.0009999668624577395,
"loss": 4.563,
"step": 299
},
{
"epoch": 0.03,
"grad_norm": 1.890124937059456,
"learning_rate": 0.0009999646900950023,
"loss": 4.6135,
"step": 300
},
{
"epoch": 0.03,
"grad_norm": 1.6240602186049096,
"learning_rate": 0.0009999624487716666,
"loss": 4.3446,
"step": 301
},
{
"epoch": 0.03,
"grad_norm": 1.900172258525495,
"learning_rate": 0.000999960138488041,
"loss": 4.3648,
"step": 302
},
{
"epoch": 0.03,
"grad_norm": 1.6425318766445263,
"learning_rate": 0.0009999577592444443,
"loss": 4.2827,
"step": 303
},
{
"epoch": 0.03,
"grad_norm": 2.413089673147163,
"learning_rate": 0.000999955311041205,
"loss": 4.2912,
"step": 304
},
{
"epoch": 0.03,
"grad_norm": 2.4031038367300814,
"learning_rate": 0.0009999527938786606,
"loss": 4.6092,
"step": 305
},
{
"epoch": 0.04,
"grad_norm": 2.957841858118663,
"learning_rate": 0.0009999502077571581,
"loss": 4.1404,
"step": 306
},
{
"epoch": 0.04,
"grad_norm": 2.9290732269664215,
"learning_rate": 0.0009999475526770545,
"loss": 4.4611,
"step": 307
},
{
"epoch": 0.04,
"grad_norm": 1.6682796108498466,
"learning_rate": 0.0009999448286387158,
"loss": 4.2804,
"step": 308
},
{
"epoch": 0.04,
"grad_norm": 2.8424514884370637,
"learning_rate": 0.0009999420356425178,
"loss": 4.5973,
"step": 309
},
{
"epoch": 0.04,
"grad_norm": 2.322467005072296,
"learning_rate": 0.0009999391736888457,
"loss": 4.2886,
"step": 310
},
{
"epoch": 0.04,
"grad_norm": 1.8235278372086394,
"learning_rate": 0.0009999362427780942,
"loss": 4.1618,
"step": 311
},
{
"epoch": 0.04,
"grad_norm": 2.369434094607981,
"learning_rate": 0.0009999332429106679,
"loss": 4.1849,
"step": 312
},
{
"epoch": 0.04,
"grad_norm": 2.0829626206004206,
"learning_rate": 0.00099993017408698,
"loss": 4.2619,
"step": 313
},
{
"epoch": 0.04,
"grad_norm": 1.562135665499809,
"learning_rate": 0.0009999270363074547,
"loss": 4.2468,
"step": 314
},
{
"epoch": 0.04,
"grad_norm": 1.9672135647360018,
"learning_rate": 0.0009999238295725237,
"loss": 4.2449,
"step": 315
},
{
"epoch": 0.04,
"grad_norm": 2.237390477781047,
"learning_rate": 0.00099992055388263,
"loss": 4.4515,
"step": 316
},
{
"epoch": 0.04,
"grad_norm": 1.7289822481385195,
"learning_rate": 0.0009999172092382252,
"loss": 4.4428,
"step": 317
},
{
"epoch": 0.04,
"grad_norm": 1.807361964324125,
"learning_rate": 0.0009999137956397707,
"loss": 4.2761,
"step": 318
},
{
"epoch": 0.04,
"grad_norm": 1.914227784366172,
"learning_rate": 0.0009999103130877373,
"loss": 4.3202,
"step": 319
},
{
"epoch": 0.04,
"grad_norm": 1.639973206740026,
"learning_rate": 0.0009999067615826054,
"loss": 4.4163,
"step": 320
},
{
"epoch": 0.04,
"grad_norm": 2.636950509972736,
"learning_rate": 0.000999903141124865,
"loss": 4.1639,
"step": 321
},
{
"epoch": 0.04,
"grad_norm": 2.92044342908847,
"learning_rate": 0.000999899451715015,
"loss": 4.2216,
"step": 322
},
{
"epoch": 0.04,
"grad_norm": 1.8259185649724194,
"learning_rate": 0.0009998956933535649,
"loss": 4.5109,
"step": 323
},
{
"epoch": 0.04,
"grad_norm": 4.089977847097526,
"learning_rate": 0.0009998918660410324,
"loss": 4.2523,
"step": 324
},
{
"epoch": 0.04,
"grad_norm": 2.035188344176966,
"learning_rate": 0.000999887969777946,
"loss": 4.4065,
"step": 325
},
{
"epoch": 0.04,
"grad_norm": 2.290073152608485,
"learning_rate": 0.000999884004564843,
"loss": 4.3425,
"step": 326
},
{
"epoch": 0.04,
"grad_norm": 1.9330517104682972,
"learning_rate": 0.00099987997040227,
"loss": 4.1753,
"step": 327
},
{
"epoch": 0.04,
"grad_norm": 11.357390832457739,
"learning_rate": 0.0009998758672907838,
"loss": 4.2634,
"step": 328
},
{
"epoch": 0.04,
"grad_norm": 2.9448695204114603,
"learning_rate": 0.0009998716952309501,
"loss": 4.3565,
"step": 329
},
{
"epoch": 0.04,
"grad_norm": 1.6938183308028358,
"learning_rate": 0.0009998674542233445,
"loss": 4.1042,
"step": 330
},
{
"epoch": 0.04,
"grad_norm": 1.710763084670905,
"learning_rate": 0.000999863144268552,
"loss": 4.4057,
"step": 331
},
{
"epoch": 0.04,
"grad_norm": 3.48367778651573,
"learning_rate": 0.000999858765367167,
"loss": 4.1762,
"step": 332
},
{
"epoch": 0.04,
"grad_norm": 2.771254612675359,
"learning_rate": 0.0009998543175197936,
"loss": 4.2629,
"step": 333
},
{
"epoch": 0.04,
"grad_norm": 2.890136265607465,
"learning_rate": 0.000999849800727045,
"loss": 4.3958,
"step": 334
},
{
"epoch": 0.04,
"grad_norm": 2.0315577935707196,
"learning_rate": 0.0009998452149895445,
"loss": 4.2555,
"step": 335
},
{
"epoch": 0.04,
"grad_norm": 4.67655098266027,
"learning_rate": 0.0009998405603079243,
"loss": 4.3867,
"step": 336
},
{
"epoch": 0.04,
"grad_norm": 2.2075607062272584,
"learning_rate": 0.0009998358366828269,
"loss": 4.3525,
"step": 337
},
{
"epoch": 0.04,
"grad_norm": 2.21224444669047,
"learning_rate": 0.0009998310441149034,
"loss": 4.4316,
"step": 338
},
{
"epoch": 0.04,
"grad_norm": 2.9320424085076833,
"learning_rate": 0.000999826182604815,
"loss": 4.353,
"step": 339
},
{
"epoch": 0.04,
"grad_norm": 1.8362671580047158,
"learning_rate": 0.0009998212521532325,
"loss": 4.2803,
"step": 340
},
{
"epoch": 0.04,
"grad_norm": 1.824304176840615,
"learning_rate": 0.0009998162527608354,
"loss": 4.2204,
"step": 341
},
{
"epoch": 0.04,
"grad_norm": 1.9692547275793184,
"learning_rate": 0.0009998111844283137,
"loss": 4.4642,
"step": 342
},
{
"epoch": 0.04,
"grad_norm": 1.9274740726597106,
"learning_rate": 0.0009998060471563665,
"loss": 4.4741,
"step": 343
},
{
"epoch": 0.04,
"grad_norm": 1.9171628146499389,
"learning_rate": 0.0009998008409457023,
"loss": 4.3747,
"step": 344
},
{
"epoch": 0.04,
"grad_norm": 1.8141376277252172,
"learning_rate": 0.000999795565797039,
"loss": 4.0678,
"step": 345
},
{
"epoch": 0.04,
"grad_norm": 2.4331340166834967,
"learning_rate": 0.0009997902217111045,
"loss": 4.1271,
"step": 346
},
{
"epoch": 0.04,
"grad_norm": 2.8169361835673388,
"learning_rate": 0.0009997848086886357,
"loss": 4.3644,
"step": 347
},
{
"epoch": 0.04,
"grad_norm": 1.5517213596259445,
"learning_rate": 0.0009997793267303792,
"loss": 4.2935,
"step": 348
},
{
"epoch": 0.04,
"grad_norm": 1.731354291279978,
"learning_rate": 0.0009997737758370914,
"loss": 4.3753,
"step": 349
},
{
"epoch": 0.04,
"grad_norm": 2.9214099135868503,
"learning_rate": 0.0009997681560095378,
"loss": 4.2818,
"step": 350
},
{
"epoch": 0.04,
"grad_norm": 2.3168053361047427,
"learning_rate": 0.0009997624672484933,
"loss": 4.243,
"step": 351
},
{
"epoch": 0.04,
"grad_norm": 2.608830425511979,
"learning_rate": 0.0009997567095547432,
"loss": 4.3503,
"step": 352
},
{
"epoch": 0.04,
"grad_norm": 1.5085659676460148,
"learning_rate": 0.000999750882929081,
"loss": 4.4028,
"step": 353
},
{
"epoch": 0.04,
"grad_norm": 3.3475479897967886,
"learning_rate": 0.0009997449873723105,
"loss": 4.3183,
"step": 354
},
{
"epoch": 0.04,
"grad_norm": 3.506348953965699,
"learning_rate": 0.000999739022885245,
"loss": 4.3996,
"step": 355
},
{
"epoch": 0.04,
"grad_norm": 2.174349927085039,
"learning_rate": 0.0009997329894687072,
"loss": 4.2434,
"step": 356
},
{
"epoch": 0.04,
"grad_norm": 5.1077458540111875,
"learning_rate": 0.0009997268871235296,
"loss": 4.3555,
"step": 357
},
{
"epoch": 0.04,
"grad_norm": 2.029820315403982,
"learning_rate": 0.0009997207158505533,
"loss": 4.3735,
"step": 358
},
{
"epoch": 0.04,
"grad_norm": 2.0327017578579363,
"learning_rate": 0.0009997144756506298,
"loss": 4.4029,
"step": 359
},
{
"epoch": 0.04,
"grad_norm": 1.4527800941913545,
"learning_rate": 0.00099970816652462,
"loss": 4.2341,
"step": 360
},
{
"epoch": 0.04,
"grad_norm": 1.860703957721792,
"learning_rate": 0.0009997017884733938,
"loss": 4.2853,
"step": 361
},
{
"epoch": 0.04,
"grad_norm": 2.0322573868389124,
"learning_rate": 0.000999695341497831,
"loss": 4.1132,
"step": 362
},
{
"epoch": 0.04,
"grad_norm": 3.8151951596454006,
"learning_rate": 0.0009996888255988207,
"loss": 4.264,
"step": 363
},
{
"epoch": 0.04,
"grad_norm": 1.4077202309359398,
"learning_rate": 0.0009996822407772623,
"loss": 4.2707,
"step": 364
},
{
"epoch": 0.04,
"grad_norm": 2.392880519936668,
"learning_rate": 0.0009996755870340633,
"loss": 4.2809,
"step": 365
},
{
"epoch": 0.04,
"grad_norm": 4.222587831656337,
"learning_rate": 0.0009996688643701419,
"loss": 4.3933,
"step": 366
},
{
"epoch": 0.04,
"grad_norm": 1.607045817504694,
"learning_rate": 0.0009996620727864252,
"loss": 4.3359,
"step": 367
},
{
"epoch": 0.04,
"grad_norm": 1.9762970613273254,
"learning_rate": 0.00099965521228385,
"loss": 4.132,
"step": 368
},
{
"epoch": 0.04,
"grad_norm": 2.4482652699405167,
"learning_rate": 0.0009996482828633624,
"loss": 4.4534,
"step": 369
},
{
"epoch": 0.04,
"grad_norm": 3.9466657139702224,
"learning_rate": 0.0009996412845259183,
"loss": 4.3283,
"step": 370
},
{
"epoch": 0.04,
"grad_norm": 2.3363606930209673,
"learning_rate": 0.0009996342172724833,
"loss": 4.3269,
"step": 371
},
{
"epoch": 0.04,
"grad_norm": 1.555438533816772,
"learning_rate": 0.0009996270811040318,
"loss": 4.1546,
"step": 372
},
{
"epoch": 0.04,
"grad_norm": 1.3799450891693201,
"learning_rate": 0.0009996198760215483,
"loss": 4.0072,
"step": 373
},
{
"epoch": 0.04,
"grad_norm": 2.789227331604965,
"learning_rate": 0.0009996126020260262,
"loss": 4.3284,
"step": 374
},
{
"epoch": 0.04,
"grad_norm": 5.995065064397545,
"learning_rate": 0.0009996052591184695,
"loss": 4.3202,
"step": 375
},
{
"epoch": 0.04,
"grad_norm": 3.1329861418074287,
"learning_rate": 0.0009995978472998905,
"loss": 4.3774,
"step": 376
},
{
"epoch": 0.04,
"grad_norm": 1.8037810973037383,
"learning_rate": 0.0009995903665713118,
"loss": 4.3896,
"step": 377
},
{
"epoch": 0.04,
"grad_norm": 2.5156038836115218,
"learning_rate": 0.000999582816933765,
"loss": 4.1237,
"step": 378
},
{
"epoch": 0.04,
"grad_norm": 2.4809106832995123,
"learning_rate": 0.0009995751983882914,
"loss": 4.3039,
"step": 379
},
{
"epoch": 0.04,
"grad_norm": 1.50925999240428,
"learning_rate": 0.000999567510935942,
"loss": 4.4198,
"step": 380
},
{
"epoch": 0.04,
"grad_norm": 1.8919292954802003,
"learning_rate": 0.0009995597545777771,
"loss": 4.3502,
"step": 381
},
{
"epoch": 0.04,
"grad_norm": 1.4631282964061914,
"learning_rate": 0.0009995519293148666,
"loss": 4.2612,
"step": 382
},
{
"epoch": 0.04,
"grad_norm": 2.7030133937626757,
"learning_rate": 0.0009995440351482897,
"loss": 4.4067,
"step": 383
},
{
"epoch": 0.04,
"grad_norm": 1.9711718644711285,
"learning_rate": 0.0009995360720791353,
"loss": 4.4199,
"step": 384
},
{
"epoch": 0.04,
"grad_norm": 2.054539931349483,
"learning_rate": 0.000999528040108502,
"loss": 4.3721,
"step": 385
},
{
"epoch": 0.04,
"grad_norm": 1.703097984938856,
"learning_rate": 0.0009995199392374972,
"loss": 4.4162,
"step": 386
},
{
"epoch": 0.04,
"grad_norm": 2.0142157133427907,
"learning_rate": 0.0009995117694672386,
"loss": 4.3357,
"step": 387
},
{
"epoch": 0.04,
"grad_norm": 1.8872273699246864,
"learning_rate": 0.000999503530798853,
"loss": 4.3758,
"step": 388
},
{
"epoch": 0.04,
"grad_norm": 2.711382463360124,
"learning_rate": 0.0009994952232334766,
"loss": 4.4434,
"step": 389
},
{
"epoch": 0.04,
"grad_norm": 2.7816815660153114,
"learning_rate": 0.0009994868467722556,
"loss": 4.2981,
"step": 390
},
{
"epoch": 0.04,
"grad_norm": 1.874592225183986,
"learning_rate": 0.0009994784014163449,
"loss": 4.2146,
"step": 391
},
{
"epoch": 0.04,
"grad_norm": 1.653423412607284,
"learning_rate": 0.0009994698871669098,
"loss": 4.4266,
"step": 392
},
{
"epoch": 0.05,
"grad_norm": 1.715946786925207,
"learning_rate": 0.0009994613040251246,
"loss": 4.3713,
"step": 393
},
{
"epoch": 0.05,
"grad_norm": 1.7543716989747122,
"learning_rate": 0.000999452651992173,
"loss": 4.4658,
"step": 394
},
{
"epoch": 0.05,
"grad_norm": 4.817152810934617,
"learning_rate": 0.0009994439310692486,
"loss": 4.0528,
"step": 395
},
{
"epoch": 0.05,
"grad_norm": 2.7384509728442397,
"learning_rate": 0.0009994351412575542,
"loss": 4.1503,
"step": 396
},
{
"epoch": 0.05,
"grad_norm": 2.954948826171897,
"learning_rate": 0.000999426282558302,
"loss": 4.335,
"step": 397
},
{
"epoch": 0.05,
"grad_norm": 2.305346089693677,
"learning_rate": 0.000999417354972714,
"loss": 4.1482,
"step": 398
},
{
"epoch": 0.05,
"grad_norm": 2.0779507356623155,
"learning_rate": 0.000999408358502022,
"loss": 4.2645,
"step": 399
},
{
"epoch": 0.05,
"grad_norm": 3.3992067111514417,
"learning_rate": 0.0009993992931474661,
"loss": 4.4103,
"step": 400
},
{
"epoch": 0.05,
"grad_norm": 1.8649210842050046,
"learning_rate": 0.0009993901589102974,
"loss": 4.4022,
"step": 401
},
{
"epoch": 0.05,
"grad_norm": 1.8994789460454895,
"learning_rate": 0.0009993809557917754,
"loss": 4.2069,
"step": 402
},
{
"epoch": 0.05,
"grad_norm": 4.7193723609231,
"learning_rate": 0.0009993716837931696,
"loss": 4.3625,
"step": 403
},
{
"epoch": 0.05,
"grad_norm": 3.0296104523406804,
"learning_rate": 0.000999362342915759,
"loss": 4.3738,
"step": 404
},
{
"epoch": 0.05,
"grad_norm": 2.5838482333449595,
"learning_rate": 0.0009993529331608318,
"loss": 4.3047,
"step": 405
},
{
"epoch": 0.05,
"grad_norm": 1.4792352344402693,
"learning_rate": 0.0009993434545296862,
"loss": 4.1021,
"step": 406
},
{
"epoch": 0.05,
"grad_norm": 3.167619525955671,
"learning_rate": 0.0009993339070236292,
"loss": 4.1586,
"step": 407
},
{
"epoch": 0.05,
"grad_norm": 1.6018565716773459,
"learning_rate": 0.000999324290643978,
"loss": 4.0348,
"step": 408
},
{
"epoch": 0.05,
"grad_norm": 1.427542081196093,
"learning_rate": 0.0009993146053920588,
"loss": 3.9817,
"step": 409
},
{
"epoch": 0.05,
"grad_norm": 1.4610839463941518,
"learning_rate": 0.0009993048512692078,
"loss": 3.9734,
"step": 410
},
{
"epoch": 0.05,
"grad_norm": 1.8807297775651135,
"learning_rate": 0.00099929502827677,
"loss": 4.3116,
"step": 411
},
{
"epoch": 0.05,
"grad_norm": 1.916794844204584,
"learning_rate": 0.0009992851364161006,
"loss": 4.3492,
"step": 412
},
{
"epoch": 0.05,
"grad_norm": 1.6042535763809467,
"learning_rate": 0.0009992751756885637,
"loss": 4.2904,
"step": 413
},
{
"epoch": 0.05,
"grad_norm": 3.2432575271957913,
"learning_rate": 0.0009992651460955335,
"loss": 4.4161,
"step": 414
},
{
"epoch": 0.05,
"grad_norm": 1.6664656283005956,
"learning_rate": 0.0009992550476383931,
"loss": 4.1874,
"step": 415
},
{
"epoch": 0.05,
"grad_norm": 2.399227905333266,
"learning_rate": 0.0009992448803185356,
"loss": 4.2845,
"step": 416
},
{
"epoch": 0.05,
"grad_norm": 1.8837521453313297,
"learning_rate": 0.0009992346441373633,
"loss": 4.0753,
"step": 417
},
{
"epoch": 0.05,
"grad_norm": 1.6459876908246365,
"learning_rate": 0.0009992243390962883,
"loss": 4.3527,
"step": 418
},
{
"epoch": 0.05,
"grad_norm": 1.3935154843005806,
"learning_rate": 0.0009992139651967319,
"loss": 4.2882,
"step": 419
},
{
"epoch": 0.05,
"grad_norm": 1.4413558791410421,
"learning_rate": 0.0009992035224401245,
"loss": 4.1771,
"step": 420
},
{
"epoch": 0.05,
"grad_norm": 1.7337219904992842,
"learning_rate": 0.0009991930108279074,
"loss": 4.2516,
"step": 421
},
{
"epoch": 0.05,
"grad_norm": 1.5745527404436643,
"learning_rate": 0.0009991824303615293,
"loss": 4.2974,
"step": 422
},
{
"epoch": 0.05,
"grad_norm": 1.7758463076457154,
"learning_rate": 0.0009991717810424506,
"loss": 4.247,
"step": 423
},
{
"epoch": 0.05,
"grad_norm": 1.6631677527762334,
"learning_rate": 0.0009991610628721397,
"loss": 4.4106,
"step": 424
},
{
"epoch": 0.05,
"grad_norm": 1.8672230363361033,
"learning_rate": 0.000999150275852075,
"loss": 4.266,
"step": 425
},
{
"epoch": 0.05,
"grad_norm": 1.9090522524430469,
"learning_rate": 0.0009991394199837444,
"loss": 4.1662,
"step": 426
},
{
"epoch": 0.05,
"grad_norm": 1.91133916486316,
"learning_rate": 0.0009991284952686455,
"loss": 3.9843,
"step": 427
},
{
"epoch": 0.05,
"grad_norm": 1.6146778810250457,
"learning_rate": 0.0009991175017082848,
"loss": 4.0937,
"step": 428
},
{
"epoch": 0.05,
"grad_norm": 2.200513031755401,
"learning_rate": 0.0009991064393041786,
"loss": 4.1786,
"step": 429
},
{
"epoch": 0.05,
"grad_norm": 1.588028511494425,
"learning_rate": 0.0009990953080578533,
"loss": 4.2679,
"step": 430
},
{
"epoch": 0.05,
"grad_norm": 1.8832431580890614,
"learning_rate": 0.0009990841079708435,
"loss": 4.1996,
"step": 431
},
{
"epoch": 0.05,
"grad_norm": 2.938964510419675,
"learning_rate": 0.0009990728390446946,
"loss": 4.0794,
"step": 432
},
{
"epoch": 0.05,
"grad_norm": 2.352905728010987,
"learning_rate": 0.0009990615012809608,
"loss": 4.0449,
"step": 433
},
{
"epoch": 0.05,
"grad_norm": 1.5626585709764087,
"learning_rate": 0.0009990500946812058,
"loss": 4.4967,
"step": 434
},
{
"epoch": 0.05,
"grad_norm": 2.750916997679828,
"learning_rate": 0.000999038619247003,
"loss": 4.2462,
"step": 435
},
{
"epoch": 0.05,
"grad_norm": 1.5449652538799465,
"learning_rate": 0.0009990270749799352,
"loss": 4.3279,
"step": 436
},
{
"epoch": 0.05,
"grad_norm": 1.9330843091419507,
"learning_rate": 0.0009990154618815948,
"loss": 4.0706,
"step": 437
},
{
"epoch": 0.05,
"grad_norm": 2.1510611600876746,
"learning_rate": 0.0009990037799535833,
"loss": 4.4807,
"step": 438
},
{
"epoch": 0.05,
"grad_norm": 2.612616012426449,
"learning_rate": 0.0009989920291975124,
"loss": 4.2772,
"step": 439
},
{
"epoch": 0.05,
"grad_norm": 1.5717237611199786,
"learning_rate": 0.0009989802096150029,
"loss": 4.3891,
"step": 440
},
{
"epoch": 0.05,
"grad_norm": 1.577107474934209,
"learning_rate": 0.0009989683212076848,
"loss": 4.1637,
"step": 441
},
{
"epoch": 0.05,
"grad_norm": 1.6478300244580233,
"learning_rate": 0.0009989563639771978,
"loss": 4.2522,
"step": 442
},
{
"epoch": 0.05,
"grad_norm": 1.6927587430572832,
"learning_rate": 0.0009989443379251916,
"loss": 4.3065,
"step": 443
},
{
"epoch": 0.05,
"grad_norm": 2.8891996311556603,
"learning_rate": 0.0009989322430533245,
"loss": 4.4178,
"step": 444
},
{
"epoch": 0.05,
"grad_norm": 1.8081540980021196,
"learning_rate": 0.0009989200793632652,
"loss": 4.1146,
"step": 445
},
{
"epoch": 0.05,
"grad_norm": 1.7554401129243336,
"learning_rate": 0.0009989078468566912,
"loss": 4.2285,
"step": 446
},
{
"epoch": 0.05,
"grad_norm": 1.8601166131951627,
"learning_rate": 0.0009988955455352898,
"loss": 4.313,
"step": 447
},
{
"epoch": 0.05,
"grad_norm": 1.3043767220073055,
"learning_rate": 0.0009988831754007576,
"loss": 4.2489,
"step": 448
},
{
"epoch": 0.05,
"grad_norm": 1.3446798904621964,
"learning_rate": 0.000998870736454801,
"loss": 4.0232,
"step": 449
},
{
"epoch": 0.05,
"grad_norm": 1.5138647439544766,
"learning_rate": 0.0009988582286991356,
"loss": 4.2073,
"step": 450
},
{
"epoch": 0.05,
"grad_norm": 4.3473933009612535,
"learning_rate": 0.0009988456521354868,
"loss": 4.1479,
"step": 451
},
{
"epoch": 0.05,
"grad_norm": 1.7081683867716282,
"learning_rate": 0.000998833006765589,
"loss": 4.1977,
"step": 452
},
{
"epoch": 0.05,
"grad_norm": 2.008695733739199,
"learning_rate": 0.0009988202925911864,
"loss": 4.3076,
"step": 453
},
{
"epoch": 0.05,
"grad_norm": 2.226676390416783,
"learning_rate": 0.000998807509614033,
"loss": 4.3455,
"step": 454
},
{
"epoch": 0.05,
"grad_norm": 2.4529790355120613,
"learning_rate": 0.0009987946578358918,
"loss": 4.2358,
"step": 455
},
{
"epoch": 0.05,
"grad_norm": 1.702687405469556,
"learning_rate": 0.0009987817372585355,
"loss": 4.0999,
"step": 456
},
{
"epoch": 0.05,
"grad_norm": 1.6464839082624225,
"learning_rate": 0.000998768747883746,
"loss": 4.1645,
"step": 457
},
{
"epoch": 0.05,
"grad_norm": 1.8219406274361665,
"learning_rate": 0.0009987556897133151,
"loss": 4.2115,
"step": 458
},
{
"epoch": 0.05,
"grad_norm": 2.0047366208483584,
"learning_rate": 0.0009987425627490441,
"loss": 4.3685,
"step": 459
},
{
"epoch": 0.05,
"grad_norm": 1.6433088845060244,
"learning_rate": 0.0009987293669927436,
"loss": 4.1339,
"step": 460
},
{
"epoch": 0.05,
"grad_norm": 1.7008352746961095,
"learning_rate": 0.0009987161024462333,
"loss": 4.4192,
"step": 461
},
{
"epoch": 0.05,
"grad_norm": 3.81429443488762,
"learning_rate": 0.0009987027691113432,
"loss": 4.3099,
"step": 462
},
{
"epoch": 0.05,
"grad_norm": 2.957255037549792,
"learning_rate": 0.0009986893669899123,
"loss": 4.0336,
"step": 463
},
{
"epoch": 0.05,
"grad_norm": 1.4487172164503406,
"learning_rate": 0.0009986758960837889,
"loss": 4.334,
"step": 464
},
{
"epoch": 0.05,
"grad_norm": 21.030796056337785,
"learning_rate": 0.0009986623563948314,
"loss": 4.2902,
"step": 465
},
{
"epoch": 0.05,
"grad_norm": 1.3780642257801548,
"learning_rate": 0.000998648747924907,
"loss": 4.3129,
"step": 466
},
{
"epoch": 0.05,
"grad_norm": 1.705575202553169,
"learning_rate": 0.0009986350706758934,
"loss": 4.3348,
"step": 467
},
{
"epoch": 0.05,
"grad_norm": 2.084448408644449,
"learning_rate": 0.0009986213246496762,
"loss": 4.3745,
"step": 468
},
{
"epoch": 0.05,
"grad_norm": 2.1588504930194965,
"learning_rate": 0.000998607509848152,
"loss": 4.2281,
"step": 469
},
{
"epoch": 0.05,
"grad_norm": 1.7234762989631718,
"learning_rate": 0.0009985936262732263,
"loss": 4.2508,
"step": 470
},
{
"epoch": 0.05,
"grad_norm": 4.0707378126056994,
"learning_rate": 0.0009985796739268138,
"loss": 4.2954,
"step": 471
},
{
"epoch": 0.05,
"grad_norm": 1.800964140352813,
"learning_rate": 0.000998565652810839,
"loss": 3.9769,
"step": 472
},
{
"epoch": 0.05,
"grad_norm": 1.4446475932540723,
"learning_rate": 0.000998551562927236,
"loss": 4.3583,
"step": 473
},
{
"epoch": 0.05,
"grad_norm": 2.523896449216337,
"learning_rate": 0.000998537404277948,
"loss": 4.1434,
"step": 474
},
{
"epoch": 0.05,
"grad_norm": 1.9069530193850508,
"learning_rate": 0.0009985231768649284,
"loss": 4.195,
"step": 475
},
{
"epoch": 0.05,
"grad_norm": 1.5923551803269194,
"learning_rate": 0.000998508880690139,
"loss": 4.3102,
"step": 476
},
{
"epoch": 0.05,
"grad_norm": 1.3656745062183016,
"learning_rate": 0.000998494515755552,
"loss": 4.1989,
"step": 477
},
{
"epoch": 0.05,
"grad_norm": 1.9344111813161828,
"learning_rate": 0.0009984800820631488,
"loss": 4.3079,
"step": 478
},
{
"epoch": 0.05,
"grad_norm": 1.8460016090634654,
"learning_rate": 0.0009984655796149201,
"loss": 4.3253,
"step": 479
},
{
"epoch": 0.06,
"grad_norm": 1.4443807170439855,
"learning_rate": 0.0009984510084128661,
"loss": 4.2087,
"step": 480
},
{
"epoch": 0.06,
"grad_norm": 2.504351563017082,
"learning_rate": 0.0009984363684589972,
"loss": 4.1932,
"step": 481
},
{
"epoch": 0.06,
"grad_norm": 1.875473736895432,
"learning_rate": 0.0009984216597553322,
"loss": 4.2492,
"step": 482
},
{
"epoch": 0.06,
"grad_norm": 1.782924005727873,
"learning_rate": 0.0009984068823039,
"loss": 4.2634,
"step": 483
},
{
"epoch": 0.06,
"grad_norm": 1.7377419834196606,
"learning_rate": 0.0009983920361067388,
"loss": 4.1939,
"step": 484
},
{
"epoch": 0.06,
"grad_norm": 1.49780906534707,
"learning_rate": 0.0009983771211658965,
"loss": 4.2586,
"step": 485
},
{
"epoch": 0.06,
"grad_norm": 5.212474395281273,
"learning_rate": 0.0009983621374834303,
"loss": 4.2255,
"step": 486
},
{
"epoch": 0.06,
"grad_norm": 1.3335553929577793,
"learning_rate": 0.0009983470850614068,
"loss": 4.0619,
"step": 487
},
{
"epoch": 0.06,
"grad_norm": 1.7155363566328607,
"learning_rate": 0.0009983319639019024,
"loss": 4.1229,
"step": 488
},
{
"epoch": 0.06,
"grad_norm": 1.78815904460192,
"learning_rate": 0.0009983167740070025,
"loss": 4.2626,
"step": 489
},
{
"epoch": 0.06,
"grad_norm": 4.786593652576243,
"learning_rate": 0.0009983015153788026,
"loss": 4.1652,
"step": 490
},
{
"epoch": 0.06,
"grad_norm": 2.03885457511657,
"learning_rate": 0.000998286188019407,
"loss": 4.2851,
"step": 491
},
{
"epoch": 0.06,
"grad_norm": 1.912137282293859,
"learning_rate": 0.00099827079193093,
"loss": 4.2247,
"step": 492
},
{
"epoch": 0.06,
"grad_norm": 1.5207324767271615,
"learning_rate": 0.0009982553271154953,
"loss": 4.1751,
"step": 493
},
{
"epoch": 0.06,
"grad_norm": 1.594172759661226,
"learning_rate": 0.0009982397935752356,
"loss": 4.1073,
"step": 494
},
{
"epoch": 0.06,
"grad_norm": 1.4192927756443627,
"learning_rate": 0.0009982241913122937,
"loss": 4.0173,
"step": 495
},
{
"epoch": 0.06,
"grad_norm": 1.5622376954998693,
"learning_rate": 0.000998208520328822,
"loss": 4.4546,
"step": 496
},
{
"epoch": 0.06,
"grad_norm": 1.7975572398456645,
"learning_rate": 0.0009981927806269812,
"loss": 4.0506,
"step": 497
},
{
"epoch": 0.06,
"grad_norm": 1.586452293418299,
"learning_rate": 0.0009981769722089428,
"loss": 4.3771,
"step": 498
},
{
"epoch": 0.06,
"grad_norm": 2.507141545274864,
"learning_rate": 0.0009981610950768873,
"loss": 4.0819,
"step": 499
},
{
"epoch": 0.06,
"grad_norm": 1.589922120273323,
"learning_rate": 0.0009981451492330046,
"loss": 4.3237,
"step": 500
},
{
"epoch": 0.06,
"grad_norm": 1.5099246398548452,
"learning_rate": 0.000998129134679494,
"loss": 4.2263,
"step": 501
},
{
"epoch": 0.06,
"grad_norm": 1.5278854374975164,
"learning_rate": 0.0009981130514185646,
"loss": 4.1423,
"step": 502
},
{
"epoch": 0.06,
"grad_norm": 2.071764368335239,
"learning_rate": 0.0009980968994524344,
"loss": 4.1247,
"step": 503
},
{
"epoch": 0.06,
"grad_norm": 4.360920871701456,
"learning_rate": 0.0009980806787833316,
"loss": 4.3393,
"step": 504
},
{
"epoch": 0.06,
"grad_norm": 1.4479058911134457,
"learning_rate": 0.0009980643894134935,
"loss": 4.2023,
"step": 505
},
{
"epoch": 0.06,
"grad_norm": 1.4282311467477264,
"learning_rate": 0.000998048031345167,
"loss": 4.1102,
"step": 506
},
{
"epoch": 0.06,
"grad_norm": 1.6012850105837713,
"learning_rate": 0.0009980316045806082,
"loss": 4.0026,
"step": 507
},
{
"epoch": 0.06,
"grad_norm": 1.5234065973771034,
"learning_rate": 0.0009980151091220826,
"loss": 4.1014,
"step": 508
},
{
"epoch": 0.06,
"grad_norm": 1.7079381420146302,
"learning_rate": 0.000997998544971866,
"loss": 4.4228,
"step": 509
},
{
"epoch": 0.06,
"grad_norm": 3.0879048988915425,
"learning_rate": 0.0009979819121322426,
"loss": 4.0952,
"step": 510
},
{
"epoch": 0.06,
"grad_norm": 1.2534923658055779,
"learning_rate": 0.000997965210605507,
"loss": 4.2086,
"step": 511
},
{
"epoch": 0.06,
"grad_norm": 2.3904070210883575,
"learning_rate": 0.0009979484403939626,
"loss": 3.9227,
"step": 512
},
{
"epoch": 0.06,
"grad_norm": 2.2013550010173057,
"learning_rate": 0.0009979316014999226,
"loss": 4.0698,
"step": 513
},
{
"epoch": 0.06,
"grad_norm": 1.874971330992358,
"learning_rate": 0.0009979146939257098,
"loss": 4.1274,
"step": 514
},
{
"epoch": 0.06,
"grad_norm": 1.2884000658407369,
"learning_rate": 0.000997897717673656,
"loss": 4.0722,
"step": 515
},
{
"epoch": 0.06,
"grad_norm": 1.4247112824547676,
"learning_rate": 0.0009978806727461028,
"loss": 4.1469,
"step": 516
},
{
"epoch": 0.06,
"grad_norm": 1.8338444652621828,
"learning_rate": 0.000997863559145401,
"loss": 4.2789,
"step": 517
},
{
"epoch": 0.06,
"grad_norm": 2.585322298329817,
"learning_rate": 0.0009978463768739118,
"loss": 4.1187,
"step": 518
},
{
"epoch": 0.06,
"grad_norm": 1.451768166506986,
"learning_rate": 0.0009978291259340045,
"loss": 3.8886,
"step": 519
},
{
"epoch": 0.06,
"grad_norm": 2.2135602121699254,
"learning_rate": 0.0009978118063280587,
"loss": 4.2749,
"step": 520
},
{
"epoch": 0.06,
"grad_norm": 7.7690976475861095,
"learning_rate": 0.0009977944180584637,
"loss": 3.9638,
"step": 521
},
{
"epoch": 0.06,
"grad_norm": 3.595326296207757,
"learning_rate": 0.0009977769611276173,
"loss": 4.134,
"step": 522
},
{
"epoch": 0.06,
"grad_norm": 2.1456430341295403,
"learning_rate": 0.0009977594355379275,
"loss": 4.0029,
"step": 523
},
{
"epoch": 0.06,
"grad_norm": 1.4701388696275608,
"learning_rate": 0.000997741841291812,
"loss": 4.1146,
"step": 524
},
{
"epoch": 0.06,
"grad_norm": 2.599532997688662,
"learning_rate": 0.000997724178391697,
"loss": 4.307,
"step": 525
},
{
"epoch": 0.06,
"grad_norm": 1.5914841650355807,
"learning_rate": 0.0009977064468400193,
"loss": 4.2285,
"step": 526
},
{
"epoch": 0.06,
"grad_norm": 2.20154652964313,
"learning_rate": 0.0009976886466392244,
"loss": 4.2076,
"step": 527
},
{
"epoch": 0.06,
"grad_norm": 2.8448031908182587,
"learning_rate": 0.0009976707777917676,
"loss": 4.086,
"step": 528
},
{
"epoch": 0.06,
"grad_norm": 2.333745987165948,
"learning_rate": 0.0009976528403001133,
"loss": 3.9884,
"step": 529
},
{
"epoch": 0.06,
"grad_norm": 1.9260832400646073,
"learning_rate": 0.0009976348341667358,
"loss": 4.2554,
"step": 530
},
{
"epoch": 0.06,
"grad_norm": 1.279482443540274,
"learning_rate": 0.0009976167593941188,
"loss": 4.3154,
"step": 531
},
{
"epoch": 0.06,
"grad_norm": 5.483665194492572,
"learning_rate": 0.000997598615984755,
"loss": 4.1684,
"step": 532
},
{
"epoch": 0.06,
"grad_norm": 1.4700802465594387,
"learning_rate": 0.0009975804039411475,
"loss": 4.2683,
"step": 533
},
{
"epoch": 0.06,
"grad_norm": 1.556617514531295,
"learning_rate": 0.0009975621232658082,
"loss": 4.2066,
"step": 534
},
{
"epoch": 0.06,
"grad_norm": 5.938519899910758,
"learning_rate": 0.000997543773961258,
"loss": 4.2524,
"step": 535
},
{
"epoch": 0.06,
"grad_norm": 1.2585559541296696,
"learning_rate": 0.0009975253560300283,
"loss": 3.9952,
"step": 536
},
{
"epoch": 0.06,
"grad_norm": 1.6792312392559359,
"learning_rate": 0.0009975068694746596,
"loss": 4.2522,
"step": 537
},
{
"epoch": 0.06,
"grad_norm": 1.6587467590262839,
"learning_rate": 0.0009974883142977015,
"loss": 4.3997,
"step": 538
},
{
"epoch": 0.06,
"grad_norm": 2.3202724469002165,
"learning_rate": 0.0009974696905017135,
"loss": 4.0084,
"step": 539
},
{
"epoch": 0.06,
"grad_norm": 2.4828249324301743,
"learning_rate": 0.0009974509980892642,
"loss": 4.1457,
"step": 540
},
{
"epoch": 0.06,
"grad_norm": 1.6082393164446473,
"learning_rate": 0.0009974322370629321,
"loss": 4.2472,
"step": 541
},
{
"epoch": 0.06,
"grad_norm": 1.6290195202216855,
"learning_rate": 0.000997413407425305,
"loss": 4.1145,
"step": 542
},
{
"epoch": 0.06,
"grad_norm": 1.523104617602103,
"learning_rate": 0.0009973945091789796,
"loss": 4.244,
"step": 543
},
{
"epoch": 0.06,
"grad_norm": 1.5844748833215803,
"learning_rate": 0.000997375542326563,
"loss": 4.1834,
"step": 544
},
{
"epoch": 0.06,
"grad_norm": 1.669913922464535,
"learning_rate": 0.0009973565068706711,
"loss": 3.9686,
"step": 545
},
{
"epoch": 0.06,
"grad_norm": 1.434144360816346,
"learning_rate": 0.0009973374028139296,
"loss": 4.1261,
"step": 546
},
{
"epoch": 0.06,
"grad_norm": 1.9903829897651333,
"learning_rate": 0.0009973182301589736,
"loss": 4.2357,
"step": 547
},
{
"epoch": 0.06,
"grad_norm": 1.3247511726465688,
"learning_rate": 0.0009972989889084473,
"loss": 4.0462,
"step": 548
},
{
"epoch": 0.06,
"grad_norm": 2.064275838901619,
"learning_rate": 0.000997279679065005,
"loss": 4.1727,
"step": 549
},
{
"epoch": 0.06,
"grad_norm": 3.6651841984017866,
"learning_rate": 0.0009972603006313098,
"loss": 4.35,
"step": 550
},
{
"epoch": 0.06,
"grad_norm": 2.0187157158460702,
"learning_rate": 0.000997240853610035,
"loss": 4.1029,
"step": 551
},
{
"epoch": 0.06,
"grad_norm": 1.4129647219336012,
"learning_rate": 0.0009972213380038627,
"loss": 4.0784,
"step": 552
},
{
"epoch": 0.06,
"grad_norm": 1.4235750312775746,
"learning_rate": 0.0009972017538154845,
"loss": 4.2098,
"step": 553
},
{
"epoch": 0.06,
"grad_norm": 1.7876370331041684,
"learning_rate": 0.000997182101047602,
"loss": 3.9709,
"step": 554
},
{
"epoch": 0.06,
"grad_norm": 1.8227111300640289,
"learning_rate": 0.0009971623797029258,
"loss": 4.0964,
"step": 555
},
{
"epoch": 0.06,
"grad_norm": 1.5843076877398035,
"learning_rate": 0.0009971425897841765,
"loss": 3.9849,
"step": 556
},
{
"epoch": 0.06,
"grad_norm": 2.132265876377265,
"learning_rate": 0.0009971227312940826,
"loss": 4.1936,
"step": 557
},
{
"epoch": 0.06,
"grad_norm": 1.6889622193517102,
"learning_rate": 0.0009971028042353844,
"loss": 4.1141,
"step": 558
},
{
"epoch": 0.06,
"grad_norm": 2.400131063705241,
"learning_rate": 0.00099708280861083,
"loss": 4.1785,
"step": 559
},
{
"epoch": 0.06,
"grad_norm": 2.9345285584843466,
"learning_rate": 0.0009970627444231776,
"loss": 4.1451,
"step": 560
},
{
"epoch": 0.06,
"grad_norm": 1.5176755910159163,
"learning_rate": 0.000997042611675194,
"loss": 3.8362,
"step": 561
},
{
"epoch": 0.06,
"grad_norm": 5.320823231381811,
"learning_rate": 0.0009970224103696568,
"loss": 4.0823,
"step": 562
},
{
"epoch": 0.06,
"grad_norm": 1.3897336441661774,
"learning_rate": 0.0009970021405093523,
"loss": 4.2072,
"step": 563
},
{
"epoch": 0.06,
"grad_norm": 3.4344538608757165,
"learning_rate": 0.0009969818020970761,
"loss": 3.993,
"step": 564
},
{
"epoch": 0.06,
"grad_norm": 1.9334547713916386,
"learning_rate": 0.0009969613951356338,
"loss": 4.1662,
"step": 565
},
{
"epoch": 0.06,
"grad_norm": 1.358149840169689,
"learning_rate": 0.0009969409196278398,
"loss": 4.0533,
"step": 566
},
{
"epoch": 0.07,
"grad_norm": 1.60757100887447,
"learning_rate": 0.0009969203755765186,
"loss": 4.2153,
"step": 567
},
{
"epoch": 0.07,
"grad_norm": 2.183863320900081,
"learning_rate": 0.0009968997629845038,
"loss": 4.1311,
"step": 568
},
{
"epoch": 0.07,
"grad_norm": 1.5471200211544884,
"learning_rate": 0.0009968790818546383,
"loss": 4.0602,
"step": 569
},
{
"epoch": 0.07,
"grad_norm": 1.9183073849495123,
"learning_rate": 0.000996858332189775,
"loss": 4.1157,
"step": 570
},
{
"epoch": 0.07,
"grad_norm": 1.894555648296673,
"learning_rate": 0.0009968375139927756,
"loss": 4.0403,
"step": 571
},
{
"epoch": 0.07,
"grad_norm": 1.476813880071536,
"learning_rate": 0.000996816627266512,
"loss": 4.4461,
"step": 572
},
{
"epoch": 0.07,
"grad_norm": 1.3078705247856606,
"learning_rate": 0.0009967956720138647,
"loss": 4.0242,
"step": 573
},
{
"epoch": 0.07,
"grad_norm": 1.4241452228625129,
"learning_rate": 0.0009967746482377243,
"loss": 4.1377,
"step": 574
},
{
"epoch": 0.07,
"grad_norm": 2.088606286674306,
"learning_rate": 0.0009967535559409905,
"loss": 4.1366,
"step": 575
},
{
"epoch": 0.07,
"grad_norm": 3.1183697030300097,
"learning_rate": 0.0009967323951265725,
"loss": 4.0718,
"step": 576
},
{
"epoch": 0.07,
"grad_norm": 2.8181021636795642,
"learning_rate": 0.0009967111657973892,
"loss": 3.9951,
"step": 577
},
{
"epoch": 0.07,
"grad_norm": 1.7985810211091366,
"learning_rate": 0.000996689867956369,
"loss": 4.1789,
"step": 578
},
{
"epoch": 0.07,
"grad_norm": 2.674205183704797,
"learning_rate": 0.0009966685016064491,
"loss": 4.264,
"step": 579
},
{
"epoch": 0.07,
"grad_norm": 1.433067428408238,
"learning_rate": 0.0009966470667505767,
"loss": 4.2296,
"step": 580
},
{
"epoch": 0.07,
"grad_norm": 1.9155876942205001,
"learning_rate": 0.0009966255633917086,
"loss": 4.1366,
"step": 581
},
{
"epoch": 0.07,
"grad_norm": 1.540915979669818,
"learning_rate": 0.0009966039915328105,
"loss": 4.2495,
"step": 582
},
{
"epoch": 0.07,
"grad_norm": 2.296715200614702,
"learning_rate": 0.0009965823511768578,
"loss": 4.1212,
"step": 583
},
{
"epoch": 0.07,
"grad_norm": 4.306543377455833,
"learning_rate": 0.0009965606423268355,
"loss": 4.2714,
"step": 584
},
{
"epoch": 0.07,
"grad_norm": 2.7273977277692967,
"learning_rate": 0.000996538864985738,
"loss": 4.2327,
"step": 585
},
{
"epoch": 0.07,
"grad_norm": 1.7551143785243801,
"learning_rate": 0.0009965170191565688,
"loss": 4.0823,
"step": 586
},
{
"epoch": 0.07,
"grad_norm": 1.893740375273181,
"learning_rate": 0.0009964951048423414,
"loss": 4.1585,
"step": 587
},
{
"epoch": 0.07,
"grad_norm": 1.9905641083956318,
"learning_rate": 0.0009964731220460784,
"loss": 4.1868,
"step": 588
},
{
"epoch": 0.07,
"grad_norm": 1.5449030665713914,
"learning_rate": 0.000996451070770812,
"loss": 4.2439,
"step": 589
},
{
"epoch": 0.07,
"grad_norm": 1.4462505162959032,
"learning_rate": 0.0009964289510195831,
"loss": 4.1768,
"step": 590
},
{
"epoch": 0.07,
"grad_norm": 1.5434352739544415,
"learning_rate": 0.0009964067627954436,
"loss": 4.1383,
"step": 591
},
{
"epoch": 0.07,
"grad_norm": 1.5851040024218337,
"learning_rate": 0.0009963845061014534,
"loss": 4.2143,
"step": 592
},
{
"epoch": 0.07,
"grad_norm": 2.9737392162317438,
"learning_rate": 0.0009963621809406826,
"loss": 4.3159,
"step": 593
},
{
"epoch": 0.07,
"grad_norm": 2.4643244274622575,
"learning_rate": 0.0009963397873162107,
"loss": 3.9931,
"step": 594
},
{
"epoch": 0.07,
"grad_norm": 1.7591523096500026,
"learning_rate": 0.0009963173252311257,
"loss": 4.4628,
"step": 595
},
{
"epoch": 0.07,
"grad_norm": 2.192205108599557,
"learning_rate": 0.0009962947946885268,
"loss": 4.0644,
"step": 596
},
{
"epoch": 0.07,
"grad_norm": 2.6868547989070533,
"learning_rate": 0.000996272195691521,
"loss": 4.0916,
"step": 597
},
{
"epoch": 0.07,
"grad_norm": 1.512219098700672,
"learning_rate": 0.0009962495282432255,
"loss": 4.0525,
"step": 598
},
{
"epoch": 0.07,
"grad_norm": 2.966362752698928,
"learning_rate": 0.0009962267923467672,
"loss": 4.1998,
"step": 599
},
{
"epoch": 0.07,
"grad_norm": 1.8545509706675711,
"learning_rate": 0.0009962039880052817,
"loss": 4.2738,
"step": 600
},
{
"epoch": 0.07,
"grad_norm": 1.612391865955191,
"learning_rate": 0.0009961811152219148,
"loss": 4.3363,
"step": 601
},
{
"epoch": 0.07,
"grad_norm": 2.215275682880689,
"learning_rate": 0.0009961581739998209,
"loss": 4.1228,
"step": 602
},
{
"epoch": 0.07,
"grad_norm": 1.5388436719900926,
"learning_rate": 0.0009961351643421646,
"loss": 4.2861,
"step": 603
},
{
"epoch": 0.07,
"grad_norm": 3.26413100231765,
"learning_rate": 0.0009961120862521195,
"loss": 4.1929,
"step": 604
},
{
"epoch": 0.07,
"grad_norm": 1.7755420053851467,
"learning_rate": 0.000996088939732869,
"loss": 4.0446,
"step": 605
},
{
"epoch": 0.07,
"grad_norm": 2.051055947769499,
"learning_rate": 0.0009960657247876056,
"loss": 4.2947,
"step": 606
},
{
"epoch": 0.07,
"grad_norm": 1.85108045901961,
"learning_rate": 0.000996042441419531,
"loss": 4.1702,
"step": 607
},
{
"epoch": 0.07,
"grad_norm": 1.7304917268419593,
"learning_rate": 0.0009960190896318572,
"loss": 4.0421,
"step": 608
},
{
"epoch": 0.07,
"grad_norm": 1.2556119399054029,
"learning_rate": 0.0009959956694278052,
"loss": 3.9978,
"step": 609
},
{
"epoch": 0.07,
"grad_norm": 1.6370960465991575,
"learning_rate": 0.000995972180810605,
"loss": 3.9639,
"step": 610
},
{
"epoch": 0.07,
"grad_norm": 1.5581173571942428,
"learning_rate": 0.0009959486237834964,
"loss": 3.897,
"step": 611
},
{
"epoch": 0.07,
"grad_norm": 2.3889100125210514,
"learning_rate": 0.0009959249983497289,
"loss": 4.269,
"step": 612
},
{
"epoch": 0.07,
"grad_norm": 1.4596146156794254,
"learning_rate": 0.0009959013045125612,
"loss": 4.1034,
"step": 613
},
{
"epoch": 0.07,
"grad_norm": 1.575489312922,
"learning_rate": 0.000995877542275261,
"loss": 4.0463,
"step": 614
},
{
"epoch": 0.07,
"grad_norm": 1.3724750625373647,
"learning_rate": 0.0009958537116411064,
"loss": 4.0892,
"step": 615
},
{
"epoch": 0.07,
"grad_norm": 1.4147832091311163,
"learning_rate": 0.000995829812613384,
"loss": 4.2942,
"step": 616
},
{
"epoch": 0.07,
"grad_norm": 1.545629033846108,
"learning_rate": 0.0009958058451953902,
"loss": 4.1335,
"step": 617
},
{
"epoch": 0.07,
"grad_norm": 1.382987493144587,
"learning_rate": 0.0009957818093904313,
"loss": 4.2096,
"step": 618
},
{
"epoch": 0.07,
"grad_norm": 1.3620340266808024,
"learning_rate": 0.000995757705201822,
"loss": 4.2155,
"step": 619
},
{
"epoch": 0.07,
"grad_norm": 2.087045663576736,
"learning_rate": 0.0009957335326328874,
"loss": 4.2768,
"step": 620
},
{
"epoch": 0.07,
"grad_norm": 1.5217177262847221,
"learning_rate": 0.0009957092916869613,
"loss": 4.1648,
"step": 621
},
{
"epoch": 0.07,
"grad_norm": 1.5085147186425738,
"learning_rate": 0.0009956849823673877,
"loss": 3.9862,
"step": 622
},
{
"epoch": 0.07,
"grad_norm": 1.2811490069695686,
"learning_rate": 0.0009956606046775192,
"loss": 4.1947,
"step": 623
},
{
"epoch": 0.07,
"grad_norm": 1.4784842952015558,
"learning_rate": 0.0009956361586207186,
"loss": 4.0288,
"step": 624
},
{
"epoch": 0.07,
"grad_norm": 3.001338900911761,
"learning_rate": 0.0009956116442003575,
"loss": 4.1915,
"step": 625
},
{
"epoch": 0.07,
"grad_norm": 1.7617320475361695,
"learning_rate": 0.0009955870614198174,
"loss": 3.9299,
"step": 626
},
{
"epoch": 0.07,
"grad_norm": 2.6332876781789483,
"learning_rate": 0.000995562410282489,
"loss": 4.2955,
"step": 627
},
{
"epoch": 0.07,
"grad_norm": 1.3639849609347612,
"learning_rate": 0.0009955376907917722,
"loss": 3.9406,
"step": 628
},
{
"epoch": 0.07,
"grad_norm": 1.5138907550520855,
"learning_rate": 0.0009955129029510768,
"loss": 4.2578,
"step": 629
},
{
"epoch": 0.07,
"grad_norm": 1.7896493816680665,
"learning_rate": 0.0009954880467638219,
"loss": 4.1592,
"step": 630
},
{
"epoch": 0.07,
"grad_norm": 1.235994667315824,
"learning_rate": 0.0009954631222334356,
"loss": 4.2443,
"step": 631
},
{
"epoch": 0.07,
"grad_norm": 2.707557780772975,
"learning_rate": 0.0009954381293633561,
"loss": 4.2409,
"step": 632
},
{
"epoch": 0.07,
"grad_norm": 1.4183097961636217,
"learning_rate": 0.0009954130681570305,
"loss": 4.1186,
"step": 633
},
{
"epoch": 0.07,
"grad_norm": 1.7298670716087088,
"learning_rate": 0.0009953879386179157,
"loss": 4.3454,
"step": 634
},
{
"epoch": 0.07,
"grad_norm": 3.949787643026631,
"learning_rate": 0.0009953627407494777,
"loss": 4.2464,
"step": 635
},
{
"epoch": 0.07,
"grad_norm": 2.532381439273023,
"learning_rate": 0.000995337474555192,
"loss": 4.0315,
"step": 636
},
{
"epoch": 0.07,
"grad_norm": 1.6807418462402264,
"learning_rate": 0.0009953121400385438,
"loss": 3.9328,
"step": 637
},
{
"epoch": 0.07,
"grad_norm": 1.8488274304785741,
"learning_rate": 0.0009952867372030273,
"loss": 4.2027,
"step": 638
},
{
"epoch": 0.07,
"grad_norm": 1.5004804954449449,
"learning_rate": 0.0009952612660521466,
"loss": 4.1245,
"step": 639
},
{
"epoch": 0.07,
"grad_norm": 1.5083268156226164,
"learning_rate": 0.0009952357265894146,
"loss": 4.0478,
"step": 640
},
{
"epoch": 0.07,
"grad_norm": 2.22439978004962,
"learning_rate": 0.000995210118818354,
"loss": 4.1608,
"step": 641
},
{
"epoch": 0.07,
"grad_norm": 1.9152396601119313,
"learning_rate": 0.0009951844427424973,
"loss": 4.1071,
"step": 642
},
{
"epoch": 0.07,
"grad_norm": 3.633611473231003,
"learning_rate": 0.0009951586983653858,
"loss": 4.1028,
"step": 643
},
{
"epoch": 0.07,
"grad_norm": 1.3858037616390062,
"learning_rate": 0.0009951328856905703,
"loss": 4.0111,
"step": 644
},
{
"epoch": 0.07,
"grad_norm": 1.446600371331332,
"learning_rate": 0.0009951070047216116,
"loss": 4.1573,
"step": 645
},
{
"epoch": 0.07,
"grad_norm": 1.454007103373636,
"learning_rate": 0.000995081055462079,
"loss": 4.1426,
"step": 646
},
{
"epoch": 0.07,
"grad_norm": 5.429911994205693,
"learning_rate": 0.0009950550379155519,
"loss": 4.0318,
"step": 647
},
{
"epoch": 0.07,
"grad_norm": 1.8483374877344385,
"learning_rate": 0.000995028952085619,
"loss": 4.0266,
"step": 648
},
{
"epoch": 0.07,
"grad_norm": 1.7012374213836405,
"learning_rate": 0.0009950027979758781,
"loss": 4.023,
"step": 649
},
{
"epoch": 0.07,
"grad_norm": 1.4842828652408195,
"learning_rate": 0.0009949765755899369,
"loss": 4.1377,
"step": 650
},
{
"epoch": 0.07,
"grad_norm": 5.12346651183532,
"learning_rate": 0.0009949502849314123,
"loss": 4.2203,
"step": 651
},
{
"epoch": 0.07,
"grad_norm": 1.9863785238513911,
"learning_rate": 0.0009949239260039304,
"loss": 4.1451,
"step": 652
},
{
"epoch": 0.07,
"grad_norm": 1.5733648107746758,
"learning_rate": 0.0009948974988111272,
"loss": 4.0476,
"step": 653
},
{
"epoch": 0.07,
"grad_norm": 1.3851246908349588,
"learning_rate": 0.0009948710033566475,
"loss": 3.9108,
"step": 654
},
{
"epoch": 0.08,
"grad_norm": 1.413087596958269,
"learning_rate": 0.000994844439644146,
"loss": 3.9836,
"step": 655
},
{
"epoch": 0.08,
"grad_norm": 1.5858218947834306,
"learning_rate": 0.0009948178076772867,
"loss": 4.0915,
"step": 656
},
{
"epoch": 0.08,
"grad_norm": 1.580882372405347,
"learning_rate": 0.0009947911074597428,
"loss": 4.2436,
"step": 657
},
{
"epoch": 0.08,
"grad_norm": 1.6617444315008543,
"learning_rate": 0.0009947643389951973,
"loss": 4.2557,
"step": 658
},
{
"epoch": 0.08,
"grad_norm": 1.4136657364986869,
"learning_rate": 0.0009947375022873422,
"loss": 4.1125,
"step": 659
},
{
"epoch": 0.08,
"grad_norm": 1.4282784659608772,
"learning_rate": 0.0009947105973398794,
"loss": 3.9847,
"step": 660
},
{
"epoch": 0.08,
"grad_norm": 1.802011690932386,
"learning_rate": 0.0009946836241565195,
"loss": 4.5018,
"step": 661
},
{
"epoch": 0.08,
"grad_norm": 1.762425890699875,
"learning_rate": 0.0009946565827409833,
"loss": 3.9913,
"step": 662
},
{
"epoch": 0.08,
"grad_norm": 1.4064618593143094,
"learning_rate": 0.0009946294730970005,
"loss": 4.24,
"step": 663
},
{
"epoch": 0.08,
"grad_norm": 2.057643590301231,
"learning_rate": 0.0009946022952283106,
"loss": 4.0534,
"step": 664
},
{
"epoch": 0.08,
"grad_norm": 1.3660268829451327,
"learning_rate": 0.0009945750491386616,
"loss": 4.1487,
"step": 665
},
{
"epoch": 0.08,
"grad_norm": 2.460997056810734,
"learning_rate": 0.0009945477348318123,
"loss": 4.351,
"step": 666
},
{
"epoch": 0.08,
"grad_norm": 1.5412447154686766,
"learning_rate": 0.00099452035231153,
"loss": 4.2104,
"step": 667
},
{
"epoch": 0.08,
"grad_norm": 3.3804350220699337,
"learning_rate": 0.0009944929015815913,
"loss": 4.1453,
"step": 668
},
{
"epoch": 0.08,
"grad_norm": 1.9147446832418717,
"learning_rate": 0.0009944653826457828,
"loss": 3.9929,
"step": 669
},
{
"epoch": 0.08,
"grad_norm": 1.3330460143249674,
"learning_rate": 0.0009944377955079004,
"loss": 4.0753,
"step": 670
},
{
"epoch": 0.08,
"grad_norm": 1.3134469374813962,
"learning_rate": 0.0009944101401717486,
"loss": 4.0162,
"step": 671
},
{
"epoch": 0.08,
"grad_norm": 1.486141109678482,
"learning_rate": 0.0009943824166411424,
"loss": 4.0119,
"step": 672
},
{
"epoch": 0.08,
"grad_norm": 1.5920173618340074,
"learning_rate": 0.0009943546249199056,
"loss": 4.0771,
"step": 673
},
{
"epoch": 0.08,
"grad_norm": 3.783770288620055,
"learning_rate": 0.0009943267650118716,
"loss": 3.8222,
"step": 674
},
{
"epoch": 0.08,
"grad_norm": 1.5883219744980457,
"learning_rate": 0.0009942988369208829,
"loss": 4.1278,
"step": 675
},
{
"epoch": 0.08,
"grad_norm": 4.252308145309663,
"learning_rate": 0.000994270840650792,
"loss": 3.8363,
"step": 676
},
{
"epoch": 0.08,
"grad_norm": 1.2879696726784864,
"learning_rate": 0.0009942427762054604,
"loss": 3.9805,
"step": 677
},
{
"epoch": 0.08,
"grad_norm": 2.1959768053912816,
"learning_rate": 0.0009942146435887589,
"loss": 3.9806,
"step": 678
},
{
"epoch": 0.08,
"grad_norm": 1.3530639059438625,
"learning_rate": 0.0009941864428045677,
"loss": 4.2631,
"step": 679
},
{
"epoch": 0.08,
"grad_norm": 1.234259697603252,
"learning_rate": 0.0009941581738567768,
"loss": 4.2267,
"step": 680
},
{
"epoch": 0.08,
"grad_norm": 1.5186806563407114,
"learning_rate": 0.0009941298367492854,
"loss": 4.3256,
"step": 681
},
{
"epoch": 0.08,
"grad_norm": 1.6869551109394108,
"learning_rate": 0.0009941014314860021,
"loss": 4.2521,
"step": 682
},
{
"epoch": 0.08,
"grad_norm": 1.399775753515481,
"learning_rate": 0.0009940729580708448,
"loss": 4.1332,
"step": 683
},
{
"epoch": 0.08,
"grad_norm": 1.4880893351267843,
"learning_rate": 0.0009940444165077408,
"loss": 4.1871,
"step": 684
},
{
"epoch": 0.08,
"grad_norm": 2.0846614972362167,
"learning_rate": 0.0009940158068006267,
"loss": 4.2293,
"step": 685
},
{
"epoch": 0.08,
"grad_norm": 1.4512658006094312,
"learning_rate": 0.0009939871289534488,
"loss": 4.0961,
"step": 686
},
{
"epoch": 0.08,
"grad_norm": 2.213945903257091,
"learning_rate": 0.0009939583829701628,
"loss": 4.074,
"step": 687
},
{
"epoch": 0.08,
"grad_norm": 1.8919105099835127,
"learning_rate": 0.0009939295688547337,
"loss": 4.1389,
"step": 688
},
{
"epoch": 0.08,
"grad_norm": 1.6391367835023194,
"learning_rate": 0.0009939006866111356,
"loss": 4.3763,
"step": 689
},
{
"epoch": 0.08,
"grad_norm": 1.5376709921483438,
"learning_rate": 0.0009938717362433524,
"loss": 4.1265,
"step": 690
},
{
"epoch": 0.08,
"grad_norm": 1.8821901951745228,
"learning_rate": 0.0009938427177553773,
"loss": 4.2587,
"step": 691
},
{
"epoch": 0.08,
"grad_norm": 1.9187971881871668,
"learning_rate": 0.0009938136311512127,
"loss": 4.1903,
"step": 692
},
{
"epoch": 0.08,
"grad_norm": 1.379118461132903,
"learning_rate": 0.0009937844764348707,
"loss": 3.9662,
"step": 693
},
{
"epoch": 0.08,
"grad_norm": 3.3290556733092025,
"learning_rate": 0.0009937552536103727,
"loss": 4.1497,
"step": 694
},
{
"epoch": 0.08,
"grad_norm": 2.2849707521158082,
"learning_rate": 0.000993725962681749,
"loss": 4.1624,
"step": 695
},
{
"epoch": 0.08,
"grad_norm": 3.731279486405896,
"learning_rate": 0.0009936966036530402,
"loss": 3.9731,
"step": 696
},
{
"epoch": 0.08,
"grad_norm": 1.7043471733458793,
"learning_rate": 0.0009936671765282956,
"loss": 4.3333,
"step": 697
},
{
"epoch": 0.08,
"grad_norm": 1.7853690135525797,
"learning_rate": 0.0009936376813115741,
"loss": 3.853,
"step": 698
},
{
"epoch": 0.08,
"grad_norm": 2.039520411297824,
"learning_rate": 0.000993608118006944,
"loss": 4.1309,
"step": 699
},
{
"epoch": 0.08,
"grad_norm": 1.5822555739936295,
"learning_rate": 0.0009935784866184833,
"loss": 4.1114,
"step": 700
},
{
"epoch": 0.08,
"grad_norm": 6.350469745437764,
"learning_rate": 0.0009935487871502787,
"loss": 4.1319,
"step": 701
},
{
"epoch": 0.08,
"grad_norm": 3.5098705682611695,
"learning_rate": 0.0009935190196064267,
"loss": 3.996,
"step": 702
},
{
"epoch": 0.08,
"grad_norm": 1.8037871243348724,
"learning_rate": 0.0009934891839910333,
"loss": 4.1872,
"step": 703
},
{
"epoch": 0.08,
"grad_norm": 4.420987942270726,
"learning_rate": 0.0009934592803082138,
"loss": 4.0095,
"step": 704
},
{
"epoch": 0.08,
"grad_norm": 2.3756940275583855,
"learning_rate": 0.0009934293085620929,
"loss": 4.4257,
"step": 705
},
{
"epoch": 0.08,
"grad_norm": 2.2349550904005206,
"learning_rate": 0.0009933992687568044,
"loss": 3.9381,
"step": 706
},
{
"epoch": 0.08,
"grad_norm": 1.2737657961091353,
"learning_rate": 0.0009933691608964917,
"loss": 4.2288,
"step": 707
},
{
"epoch": 0.08,
"grad_norm": 1.4877276508498172,
"learning_rate": 0.0009933389849853078,
"loss": 4.0222,
"step": 708
},
{
"epoch": 0.08,
"grad_norm": 1.9461136730740822,
"learning_rate": 0.0009933087410274148,
"loss": 4.1435,
"step": 709
},
{
"epoch": 0.08,
"grad_norm": 1.608162646122769,
"learning_rate": 0.0009932784290269843,
"loss": 4.2694,
"step": 710
},
{
"epoch": 0.08,
"grad_norm": 1.782466565429436,
"learning_rate": 0.0009932480489881974,
"loss": 4.1623,
"step": 711
},
{
"epoch": 0.08,
"grad_norm": 1.6479064806227908,
"learning_rate": 0.0009932176009152442,
"loss": 4.2787,
"step": 712
},
{
"epoch": 0.08,
"grad_norm": 1.6435651627528534,
"learning_rate": 0.0009931870848123245,
"loss": 4.2694,
"step": 713
},
{
"epoch": 0.08,
"grad_norm": 1.4703496447852837,
"learning_rate": 0.0009931565006836476,
"loss": 4.0805,
"step": 714
},
{
"epoch": 0.08,
"grad_norm": 1.4829289788949316,
"learning_rate": 0.0009931258485334315,
"loss": 4.2708,
"step": 715
},
{
"epoch": 0.08,
"grad_norm": 2.5900367960834214,
"learning_rate": 0.0009930951283659048,
"loss": 3.891,
"step": 716
},
{
"epoch": 0.08,
"grad_norm": 1.8875196289077798,
"learning_rate": 0.0009930643401853043,
"loss": 4.0185,
"step": 717
},
{
"epoch": 0.08,
"grad_norm": 1.461788156993926,
"learning_rate": 0.0009930334839958765,
"loss": 4.1196,
"step": 718
},
{
"epoch": 0.08,
"grad_norm": 1.5222977257512633,
"learning_rate": 0.000993002559801878,
"loss": 4.0555,
"step": 719
},
{
"epoch": 0.08,
"grad_norm": 3.8503979664643304,
"learning_rate": 0.0009929715676075736,
"loss": 3.9817,
"step": 720
},
{
"epoch": 0.08,
"grad_norm": 2.053695215129397,
"learning_rate": 0.0009929405074172383,
"loss": 4.4499,
"step": 721
},
{
"epoch": 0.08,
"grad_norm": 1.531752112452093,
"learning_rate": 0.0009929093792351567,
"loss": 3.9741,
"step": 722
},
{
"epoch": 0.08,
"grad_norm": 4.082202578649599,
"learning_rate": 0.0009928781830656215,
"loss": 4.3045,
"step": 723
},
{
"epoch": 0.08,
"grad_norm": 1.9048628451082397,
"learning_rate": 0.0009928469189129363,
"loss": 4.2997,
"step": 724
},
{
"epoch": 0.08,
"grad_norm": 1.3102157689427596,
"learning_rate": 0.0009928155867814131,
"loss": 4.1007,
"step": 725
},
{
"epoch": 0.08,
"grad_norm": 1.3166056446145378,
"learning_rate": 0.0009927841866753735,
"loss": 4.2061,
"step": 726
},
{
"epoch": 0.08,
"grad_norm": 1.712375209673574,
"learning_rate": 0.000992752718599149,
"loss": 4.3986,
"step": 727
},
{
"epoch": 0.08,
"grad_norm": 1.4959218245341306,
"learning_rate": 0.0009927211825570793,
"loss": 4.0061,
"step": 728
},
{
"epoch": 0.08,
"grad_norm": 2.137165697288484,
"learning_rate": 0.000992689578553515,
"loss": 4.0319,
"step": 729
},
{
"epoch": 0.08,
"grad_norm": 1.3762459943741774,
"learning_rate": 0.0009926579065928144,
"loss": 3.8738,
"step": 730
},
{
"epoch": 0.08,
"grad_norm": 2.5979882660965306,
"learning_rate": 0.000992626166679347,
"loss": 4.1582,
"step": 731
},
{
"epoch": 0.08,
"grad_norm": 1.7174436227796919,
"learning_rate": 0.0009925943588174897,
"loss": 4.0937,
"step": 732
},
{
"epoch": 0.08,
"grad_norm": 2.379061018713314,
"learning_rate": 0.0009925624830116305,
"loss": 4.3185,
"step": 733
},
{
"epoch": 0.08,
"grad_norm": 1.9172839175239496,
"learning_rate": 0.000992530539266166,
"loss": 4.1907,
"step": 734
},
{
"epoch": 0.08,
"grad_norm": 1.4845328136986566,
"learning_rate": 0.0009924985275855018,
"loss": 4.1901,
"step": 735
},
{
"epoch": 0.08,
"grad_norm": 1.9436011884517015,
"learning_rate": 0.000992466447974054,
"loss": 3.9915,
"step": 736
},
{
"epoch": 0.08,
"grad_norm": 1.5635948131574493,
"learning_rate": 0.0009924343004362466,
"loss": 4.004,
"step": 737
},
{
"epoch": 0.08,
"grad_norm": 1.6891859420652,
"learning_rate": 0.0009924020849765142,
"loss": 4.0188,
"step": 738
},
{
"epoch": 0.08,
"grad_norm": 1.8613933117912842,
"learning_rate": 0.0009923698015993003,
"loss": 3.8481,
"step": 739
},
{
"epoch": 0.08,
"grad_norm": 1.3842663601801606,
"learning_rate": 0.0009923374503090577,
"loss": 4.2203,
"step": 740
},
{
"epoch": 0.08,
"grad_norm": 2.09529312034581,
"learning_rate": 0.0009923050311102487,
"loss": 4.0311,
"step": 741
},
{
"epoch": 0.09,
"grad_norm": 1.6233330964871857,
"learning_rate": 0.0009922725440073446,
"loss": 4.2288,
"step": 742
},
{
"epoch": 0.09,
"grad_norm": 1.5915011985174687,
"learning_rate": 0.0009922399890048268,
"loss": 3.8865,
"step": 743
},
{
"epoch": 0.09,
"grad_norm": 1.3742105686216515,
"learning_rate": 0.0009922073661071855,
"loss": 4.0008,
"step": 744
},
{
"epoch": 0.09,
"grad_norm": 2.0608412808373853,
"learning_rate": 0.0009921746753189203,
"loss": 4.1446,
"step": 745
},
{
"epoch": 0.09,
"grad_norm": 1.5888780670070313,
"learning_rate": 0.0009921419166445404,
"loss": 4.0121,
"step": 746
},
{
"epoch": 0.09,
"grad_norm": 1.265541328055408,
"learning_rate": 0.0009921090900885641,
"loss": 4.1654,
"step": 747
},
{
"epoch": 0.09,
"grad_norm": 1.4874840245069292,
"learning_rate": 0.0009920761956555193,
"loss": 4.0256,
"step": 748
},
{
"epoch": 0.09,
"grad_norm": 2.23067104817785,
"learning_rate": 0.0009920432333499433,
"loss": 4.1285,
"step": 749
},
{
"epoch": 0.09,
"grad_norm": 1.803586401300367,
"learning_rate": 0.0009920102031763822,
"loss": 4.1161,
"step": 750
},
{
"epoch": 0.09,
"grad_norm": 1.2950900331867707,
"learning_rate": 0.0009919771051393922,
"loss": 4.0044,
"step": 751
},
{
"epoch": 0.09,
"grad_norm": 3.6537802139836675,
"learning_rate": 0.0009919439392435385,
"loss": 3.9011,
"step": 752
},
{
"epoch": 0.09,
"grad_norm": 1.6266253975019092,
"learning_rate": 0.0009919107054933956,
"loss": 4.2069,
"step": 753
},
{
"epoch": 0.09,
"grad_norm": 1.3100424463049318,
"learning_rate": 0.0009918774038935477,
"loss": 4.2516,
"step": 754
},
{
"epoch": 0.09,
"grad_norm": 1.568253036461531,
"learning_rate": 0.000991844034448588,
"loss": 3.9562,
"step": 755
},
{
"epoch": 0.09,
"grad_norm": 2.2350638813150208,
"learning_rate": 0.000991810597163119,
"loss": 4.0138,
"step": 756
},
{
"epoch": 0.09,
"grad_norm": 1.5601470900280976,
"learning_rate": 0.000991777092041753,
"loss": 3.9203,
"step": 757
},
{
"epoch": 0.09,
"grad_norm": 1.1019994123823387,
"learning_rate": 0.0009917435190891111,
"loss": 3.9658,
"step": 758
},
{
"epoch": 0.09,
"grad_norm": 4.436430897466474,
"learning_rate": 0.0009917098783098243,
"loss": 4.0674,
"step": 759
},
{
"epoch": 0.09,
"grad_norm": 1.5380130199940163,
"learning_rate": 0.0009916761697085327,
"loss": 4.2066,
"step": 760
},
{
"epoch": 0.09,
"grad_norm": 1.799214650424404,
"learning_rate": 0.0009916423932898857,
"loss": 4.0897,
"step": 761
},
{
"epoch": 0.09,
"grad_norm": 1.9883547786814795,
"learning_rate": 0.0009916085490585423,
"loss": 4.1879,
"step": 762
},
{
"epoch": 0.09,
"grad_norm": 1.8271889789986793,
"learning_rate": 0.0009915746370191701,
"loss": 4.1315,
"step": 763
},
{
"epoch": 0.09,
"grad_norm": 1.2990876773015267,
"learning_rate": 0.0009915406571764471,
"loss": 4.1213,
"step": 764
},
{
"epoch": 0.09,
"grad_norm": 2.312643638462174,
"learning_rate": 0.0009915066095350603,
"loss": 4.2784,
"step": 765
},
{
"epoch": 0.09,
"grad_norm": 3.437472453934804,
"learning_rate": 0.0009914724940997053,
"loss": 4.269,
"step": 766
},
{
"epoch": 0.09,
"grad_norm": 2.391918574393383,
"learning_rate": 0.0009914383108750883,
"loss": 3.9158,
"step": 767
},
{
"epoch": 0.09,
"grad_norm": 2.459397033169588,
"learning_rate": 0.000991404059865924,
"loss": 4.254,
"step": 768
},
{
"epoch": 0.09,
"grad_norm": 1.5056946961373214,
"learning_rate": 0.0009913697410769366,
"loss": 3.9395,
"step": 769
},
{
"epoch": 0.09,
"grad_norm": 1.588335241429291,
"learning_rate": 0.0009913353545128597,
"loss": 4.095,
"step": 770
},
{
"epoch": 0.09,
"grad_norm": 2.957160013007683,
"learning_rate": 0.0009913009001784364,
"loss": 4.3068,
"step": 771
},
{
"epoch": 0.09,
"grad_norm": 2.7564536672978432,
"learning_rate": 0.0009912663780784188,
"loss": 4.2376,
"step": 772
},
{
"epoch": 0.09,
"grad_norm": 1.9578620602619432,
"learning_rate": 0.000991231788217569,
"loss": 3.9008,
"step": 773
},
{
"epoch": 0.09,
"grad_norm": 2.0411401208523645,
"learning_rate": 0.0009911971306006575,
"loss": 4.148,
"step": 774
},
{
"epoch": 0.09,
"grad_norm": 1.3266303336156848,
"learning_rate": 0.000991162405232465,
"loss": 4.0964,
"step": 775
},
{
"epoch": 0.09,
"grad_norm": 1.7313435379031945,
"learning_rate": 0.0009911276121177812,
"loss": 4.4221,
"step": 776
},
{
"epoch": 0.09,
"grad_norm": 1.5773925852216288,
"learning_rate": 0.0009910927512614051,
"loss": 4.2298,
"step": 777
},
{
"epoch": 0.09,
"grad_norm": 1.7923144358760363,
"learning_rate": 0.000991057822668145,
"loss": 4.1448,
"step": 778
},
{
"epoch": 0.09,
"grad_norm": 1.9762495520882326,
"learning_rate": 0.0009910228263428186,
"loss": 4.053,
"step": 779
},
{
"epoch": 0.09,
"grad_norm": 1.364840414866272,
"learning_rate": 0.000990987762290253,
"loss": 3.9459,
"step": 780
},
{
"epoch": 0.09,
"grad_norm": 1.3170548492412446,
"learning_rate": 0.0009909526305152848,
"loss": 4.1839,
"step": 781
},
{
"epoch": 0.09,
"grad_norm": 1.4851904515030712,
"learning_rate": 0.0009909174310227596,
"loss": 3.9447,
"step": 782
},
{
"epoch": 0.09,
"grad_norm": 1.2615054080730894,
"learning_rate": 0.0009908821638175325,
"loss": 3.9618,
"step": 783
},
{
"epoch": 0.09,
"grad_norm": 1.4097100490736516,
"learning_rate": 0.000990846828904468,
"loss": 4.3071,
"step": 784
},
{
"epoch": 0.09,
"grad_norm": 2.360200741587504,
"learning_rate": 0.0009908114262884397,
"loss": 4.2358,
"step": 785
},
{
"epoch": 0.09,
"grad_norm": 17.20122497542913,
"learning_rate": 0.0009907759559743311,
"loss": 4.0454,
"step": 786
},
{
"epoch": 0.09,
"grad_norm": 1.6158886209071366,
"learning_rate": 0.0009907404179670342,
"loss": 4.074,
"step": 787
},
{
"epoch": 0.09,
"grad_norm": 2.2709370867141287,
"learning_rate": 0.000990704812271451,
"loss": 4.2261,
"step": 788
},
{
"epoch": 0.09,
"grad_norm": 1.255600981552734,
"learning_rate": 0.0009906691388924928,
"loss": 4.376,
"step": 789
},
{
"epoch": 0.09,
"grad_norm": 1.629476776989314,
"learning_rate": 0.0009906333978350799,
"loss": 3.9338,
"step": 790
},
{
"epoch": 0.09,
"grad_norm": 17.38085593199682,
"learning_rate": 0.000990597589104142,
"loss": 4.0696,
"step": 791
},
{
"epoch": 0.09,
"grad_norm": 8.036231504489686,
"learning_rate": 0.0009905617127046182,
"loss": 3.9258,
"step": 792
},
{
"epoch": 0.09,
"grad_norm": 1.4429522426425332,
"learning_rate": 0.0009905257686414573,
"loss": 4.0944,
"step": 793
},
{
"epoch": 0.09,
"grad_norm": 1.3579793736907337,
"learning_rate": 0.0009904897569196168,
"loss": 4.2673,
"step": 794
},
{
"epoch": 0.09,
"grad_norm": 1.3456311024088905,
"learning_rate": 0.0009904536775440641,
"loss": 3.9632,
"step": 795
},
{
"epoch": 0.09,
"grad_norm": 1.18416963357136,
"learning_rate": 0.0009904175305197752,
"loss": 4.0098,
"step": 796
},
{
"epoch": 0.09,
"grad_norm": 1.781172523127943,
"learning_rate": 0.0009903813158517363,
"loss": 4.0676,
"step": 797
},
{
"epoch": 0.09,
"grad_norm": 1.5266544065259735,
"learning_rate": 0.0009903450335449423,
"loss": 4.2542,
"step": 798
},
{
"epoch": 0.09,
"grad_norm": 1.744801038215624,
"learning_rate": 0.0009903086836043978,
"loss": 4.2541,
"step": 799
},
{
"epoch": 0.09,
"grad_norm": 1.4002368410495831,
"learning_rate": 0.0009902722660351166,
"loss": 4.1107,
"step": 800
},
{
"epoch": 0.09,
"grad_norm": 1.286801490908429,
"learning_rate": 0.0009902357808421218,
"loss": 3.9083,
"step": 801
},
{
"epoch": 0.09,
"grad_norm": 1.6759711857123307,
"learning_rate": 0.0009901992280304456,
"loss": 4.1167,
"step": 802
},
{
"epoch": 0.09,
"grad_norm": 1.3345841796770548,
"learning_rate": 0.00099016260760513,
"loss": 4.1082,
"step": 803
},
{
"epoch": 0.09,
"grad_norm": 2.1475747215420826,
"learning_rate": 0.000990125919571226,
"loss": 4.0635,
"step": 804
},
{
"epoch": 0.09,
"grad_norm": 2.2426641437880592,
"learning_rate": 0.000990089163933794,
"loss": 4.051,
"step": 805
},
{
"epoch": 0.09,
"grad_norm": 9.29529912028014,
"learning_rate": 0.000990052340697904,
"loss": 4.2248,
"step": 806
},
{
"epoch": 0.09,
"grad_norm": 1.441681225478933,
"learning_rate": 0.0009900154498686349,
"loss": 4.2893,
"step": 807
},
{
"epoch": 0.09,
"grad_norm": 1.3589736316717063,
"learning_rate": 0.0009899784914510748,
"loss": 3.811,
"step": 808
},
{
"epoch": 0.09,
"grad_norm": 1.5766839084839277,
"learning_rate": 0.0009899414654503216,
"loss": 4.1897,
"step": 809
},
{
"epoch": 0.09,
"grad_norm": 7.399332756469976,
"learning_rate": 0.0009899043718714826,
"loss": 3.8399,
"step": 810
},
{
"epoch": 0.09,
"grad_norm": 1.6649543075900002,
"learning_rate": 0.0009898672107196739,
"loss": 4.2661,
"step": 811
},
{
"epoch": 0.09,
"grad_norm": 1.8014402286618858,
"learning_rate": 0.000989829982000021,
"loss": 4.1699,
"step": 812
},
{
"epoch": 0.09,
"grad_norm": 1.56731257169305,
"learning_rate": 0.000989792685717659,
"loss": 3.8234,
"step": 813
},
{
"epoch": 0.09,
"grad_norm": 1.3844508086333362,
"learning_rate": 0.0009897553218777327,
"loss": 4.3748,
"step": 814
},
{
"epoch": 0.09,
"grad_norm": 1.4379101979071727,
"learning_rate": 0.000989717890485395,
"loss": 4.0049,
"step": 815
},
{
"epoch": 0.09,
"grad_norm": 2.569345715157095,
"learning_rate": 0.0009896803915458094,
"loss": 4.0613,
"step": 816
},
{
"epoch": 0.09,
"grad_norm": 1.4951417577470074,
"learning_rate": 0.0009896428250641479,
"loss": 4.0152,
"step": 817
},
{
"epoch": 0.09,
"grad_norm": 1.4142744268436234,
"learning_rate": 0.000989605191045592,
"loss": 4.1667,
"step": 818
},
{
"epoch": 0.09,
"grad_norm": 1.6652016973498989,
"learning_rate": 0.0009895674894953327,
"loss": 4.0481,
"step": 819
},
{
"epoch": 0.09,
"grad_norm": 2.129185343182342,
"learning_rate": 0.0009895297204185706,
"loss": 4.0972,
"step": 820
},
{
"epoch": 0.09,
"grad_norm": 1.4586831829493963,
"learning_rate": 0.0009894918838205145,
"loss": 4.176,
"step": 821
},
{
"epoch": 0.09,
"grad_norm": 1.7686321361429058,
"learning_rate": 0.0009894539797063837,
"loss": 4.0891,
"step": 822
},
{
"epoch": 0.09,
"grad_norm": 3.9028000496141932,
"learning_rate": 0.0009894160080814061,
"loss": 4.2762,
"step": 823
},
{
"epoch": 0.09,
"grad_norm": 1.6233650852294506,
"learning_rate": 0.0009893779689508194,
"loss": 4.2213,
"step": 824
},
{
"epoch": 0.09,
"grad_norm": 1.6995991967356299,
"learning_rate": 0.0009893398623198703,
"loss": 4.0469,
"step": 825
},
{
"epoch": 0.09,
"grad_norm": 1.7138268586315053,
"learning_rate": 0.0009893016881938148,
"loss": 4.4536,
"step": 826
},
{
"epoch": 0.09,
"grad_norm": 12.222807809258466,
"learning_rate": 0.0009892634465779185,
"loss": 4.0576,
"step": 827
},
{
"epoch": 0.09,
"grad_norm": 2.0979289498034968,
"learning_rate": 0.000989225137477456,
"loss": 4.099,
"step": 828
},
{
"epoch": 0.1,
"grad_norm": 1.7814119378572693,
"learning_rate": 0.000989186760897711,
"loss": 4.1562,
"step": 829
},
{
"epoch": 0.1,
"grad_norm": 1.478146420513788,
"learning_rate": 0.0009891483168439773,
"loss": 4.1868,
"step": 830
},
{
"epoch": 0.1,
"grad_norm": 3.714317827936107,
"learning_rate": 0.000989109805321557,
"loss": 4.1632,
"step": 831
},
{
"epoch": 0.1,
"grad_norm": 3.7841301342451863,
"learning_rate": 0.0009890712263357626,
"loss": 4.1657,
"step": 832
},
{
"epoch": 0.1,
"grad_norm": 1.5302849696184502,
"learning_rate": 0.000989032579891915,
"loss": 4.184,
"step": 833
},
{
"epoch": 0.1,
"grad_norm": 1.3819473192914429,
"learning_rate": 0.000988993865995345,
"loss": 4.1686,
"step": 834
},
{
"epoch": 0.1,
"grad_norm": 1.5227706756198385,
"learning_rate": 0.000988955084651392,
"loss": 4.1063,
"step": 835
},
{
"epoch": 0.1,
"grad_norm": 2.1564115222494316,
"learning_rate": 0.0009889162358654056,
"loss": 4.1291,
"step": 836
},
{
"epoch": 0.1,
"grad_norm": 4.230457294296131,
"learning_rate": 0.000988877319642744,
"loss": 4.2703,
"step": 837
},
{
"epoch": 0.1,
"grad_norm": 1.9964827995851897,
"learning_rate": 0.000988838335988775,
"loss": 4.1894,
"step": 838
},
{
"epoch": 0.1,
"grad_norm": 2.0406491401140276,
"learning_rate": 0.0009887992849088754,
"loss": 4.3152,
"step": 839
},
{
"epoch": 0.1,
"grad_norm": 1.6085421671755622,
"learning_rate": 0.000988760166408432,
"loss": 4.1769,
"step": 840
},
{
"epoch": 0.1,
"grad_norm": 1.8315553915979064,
"learning_rate": 0.0009887209804928404,
"loss": 4.1022,
"step": 841
},
{
"epoch": 0.1,
"grad_norm": 3.3794568203614013,
"learning_rate": 0.0009886817271675052,
"loss": 4.0518,
"step": 842
},
{
"epoch": 0.1,
"grad_norm": 2.1814928581227737,
"learning_rate": 0.000988642406437841,
"loss": 4.234,
"step": 843
},
{
"epoch": 0.1,
"grad_norm": 1.5869702261844612,
"learning_rate": 0.0009886030183092712,
"loss": 4.2866,
"step": 844
},
{
"epoch": 0.1,
"grad_norm": 1.5700726822071096,
"learning_rate": 0.0009885635627872285,
"loss": 4.2778,
"step": 845
},
{
"epoch": 0.1,
"grad_norm": 1.4491229621323376,
"learning_rate": 0.0009885240398771554,
"loss": 4.0254,
"step": 846
},
{
"epoch": 0.1,
"grad_norm": 2.8395167527650202,
"learning_rate": 0.0009884844495845029,
"loss": 3.837,
"step": 847
},
{
"epoch": 0.1,
"grad_norm": 2.0076759503758663,
"learning_rate": 0.000988444791914732,
"loss": 3.9375,
"step": 848
},
{
"epoch": 0.1,
"grad_norm": 4.3589814248214696,
"learning_rate": 0.0009884050668733126,
"loss": 3.9184,
"step": 849
},
{
"epoch": 0.1,
"grad_norm": 1.5590471550623999,
"learning_rate": 0.0009883652744657244,
"loss": 3.9813,
"step": 850
},
{
"epoch": 0.1,
"grad_norm": 2.4959186932592177,
"learning_rate": 0.0009883254146974554,
"loss": 4.064,
"step": 851
},
{
"epoch": 0.1,
"grad_norm": 1.9508550664621633,
"learning_rate": 0.0009882854875740037,
"loss": 3.9079,
"step": 852
},
{
"epoch": 0.1,
"grad_norm": 1.4567996861311256,
"learning_rate": 0.0009882454931008768,
"loss": 4.0238,
"step": 853
},
{
"epoch": 0.1,
"grad_norm": 1.4506773316372805,
"learning_rate": 0.0009882054312835907,
"loss": 4.0492,
"step": 854
},
{
"epoch": 0.1,
"grad_norm": 1.8847096976239732,
"learning_rate": 0.0009881653021276715,
"loss": 4.1245,
"step": 855
},
{
"epoch": 0.1,
"grad_norm": 1.4566847278233952,
"learning_rate": 0.0009881251056386541,
"loss": 4.1634,
"step": 856
},
{
"epoch": 0.1,
"grad_norm": 1.4388279201195637,
"learning_rate": 0.000988084841822083,
"loss": 4.0466,
"step": 857
},
{
"epoch": 0.1,
"grad_norm": 1.3152963008077352,
"learning_rate": 0.0009880445106835117,
"loss": 4.1887,
"step": 858
},
{
"epoch": 0.1,
"grad_norm": 1.829521836713052,
"learning_rate": 0.000988004112228503,
"loss": 3.9422,
"step": 859
},
{
"epoch": 0.1,
"grad_norm": 1.7683553482509518,
"learning_rate": 0.0009879636464626294,
"loss": 4.2288,
"step": 860
},
{
"epoch": 0.1,
"grad_norm": 1.5707412825756044,
"learning_rate": 0.0009879231133914721,
"loss": 4.0048,
"step": 861
},
{
"epoch": 0.1,
"grad_norm": 1.969529807325113,
"learning_rate": 0.000987882513020622,
"loss": 4.1848,
"step": 862
},
{
"epoch": 0.1,
"grad_norm": 1.3957050421234052,
"learning_rate": 0.000987841845355679,
"loss": 4.0344,
"step": 863
},
{
"epoch": 0.1,
"grad_norm": 1.3466275978937852,
"learning_rate": 0.0009878011104022526,
"loss": 4.2563,
"step": 864
},
{
"epoch": 0.1,
"grad_norm": 3.0018477168130997,
"learning_rate": 0.0009877603081659614,
"loss": 4.2019,
"step": 865
},
{
"epoch": 0.1,
"grad_norm": 3.6752736040736806,
"learning_rate": 0.0009877194386524334,
"loss": 4.0374,
"step": 866
},
{
"epoch": 0.1,
"grad_norm": 3.0727665360696284,
"learning_rate": 0.0009876785018673054,
"loss": 3.9671,
"step": 867
},
{
"epoch": 0.1,
"grad_norm": 1.7870156973958151,
"learning_rate": 0.0009876374978162242,
"loss": 4.1461,
"step": 868
},
{
"epoch": 0.1,
"grad_norm": 1.3735506606338912,
"learning_rate": 0.0009875964265048452,
"loss": 4.2198,
"step": 869
},
{
"epoch": 0.1,
"grad_norm": 1.3218542574864838,
"learning_rate": 0.0009875552879388336,
"loss": 3.9638,
"step": 870
},
{
"epoch": 0.1,
"grad_norm": 1.0943544952431903,
"learning_rate": 0.000987514082123864,
"loss": 4.0521,
"step": 871
},
{
"epoch": 0.1,
"grad_norm": 1.3252001668099544,
"learning_rate": 0.0009874728090656193,
"loss": 3.975,
"step": 872
},
{
"epoch": 0.1,
"grad_norm": 1.5014269548392112,
"learning_rate": 0.0009874314687697927,
"loss": 4.1303,
"step": 873
},
{
"epoch": 0.1,
"grad_norm": 1.5372271375653475,
"learning_rate": 0.0009873900612420866,
"loss": 3.867,
"step": 874
},
{
"epoch": 0.1,
"grad_norm": 1.2843747646745087,
"learning_rate": 0.0009873485864882116,
"loss": 4.1952,
"step": 875
},
{
"epoch": 0.1,
"grad_norm": 1.315868796542572,
"learning_rate": 0.000987307044513889,
"loss": 3.9562,
"step": 876
},
{
"epoch": 0.1,
"grad_norm": 1.6928113796533932,
"learning_rate": 0.0009872654353248486,
"loss": 4.1208,
"step": 877
},
{
"epoch": 0.1,
"grad_norm": 1.4173688015632502,
"learning_rate": 0.0009872237589268295,
"loss": 4.0893,
"step": 878
},
{
"epoch": 0.1,
"grad_norm": 2.2275560125875273,
"learning_rate": 0.00098718201532558,
"loss": 4.1059,
"step": 879
},
{
"epoch": 0.1,
"grad_norm": 5.122230574757189,
"learning_rate": 0.0009871402045268582,
"loss": 4.1319,
"step": 880
},
{
"epoch": 0.1,
"grad_norm": 1.476643187041516,
"learning_rate": 0.000987098326536431,
"loss": 4.0193,
"step": 881
},
{
"epoch": 0.1,
"grad_norm": 1.4040179738262668,
"learning_rate": 0.0009870563813600744,
"loss": 4.0415,
"step": 882
},
{
"epoch": 0.1,
"grad_norm": 2.886185077891304,
"learning_rate": 0.0009870143690035743,
"loss": 4.2009,
"step": 883
},
{
"epoch": 0.1,
"grad_norm": 1.9096872741562954,
"learning_rate": 0.0009869722894727251,
"loss": 3.9822,
"step": 884
},
{
"epoch": 0.1,
"grad_norm": 1.4362990754540939,
"learning_rate": 0.0009869301427733314,
"loss": 3.9824,
"step": 885
},
{
"epoch": 0.1,
"grad_norm": 3.007925592492388,
"learning_rate": 0.000986887928911206,
"loss": 4.1258,
"step": 886
},
{
"epoch": 0.1,
"grad_norm": 1.691359626621572,
"learning_rate": 0.0009868456478921719,
"loss": 4.0665,
"step": 887
},
{
"epoch": 0.1,
"grad_norm": 1.5900343477393342,
"learning_rate": 0.0009868032997220608,
"loss": 4.1633,
"step": 888
},
{
"epoch": 0.1,
"grad_norm": 2.105024800169,
"learning_rate": 0.0009867608844067136,
"loss": 4.0624,
"step": 889
},
{
"epoch": 0.1,
"grad_norm": 1.5609714372608663,
"learning_rate": 0.000986718401951981,
"loss": 4.0395,
"step": 890
},
{
"epoch": 0.1,
"grad_norm": 1.190038163136122,
"learning_rate": 0.0009866758523637228,
"loss": 4.1511,
"step": 891
},
{
"epoch": 0.1,
"grad_norm": 1.6345611315285546,
"learning_rate": 0.0009866332356478075,
"loss": 3.9277,
"step": 892
},
{
"epoch": 0.1,
"grad_norm": 1.4068003927564272,
"learning_rate": 0.000986590551810113,
"loss": 3.9804,
"step": 893
},
{
"epoch": 0.1,
"grad_norm": 2.8895820469451077,
"learning_rate": 0.0009865478008565275,
"loss": 4.2113,
"step": 894
},
{
"epoch": 0.1,
"grad_norm": 1.5962174824713355,
"learning_rate": 0.0009865049827929475,
"loss": 4.0425,
"step": 895
},
{
"epoch": 0.1,
"grad_norm": 2.110629365373305,
"learning_rate": 0.0009864620976252785,
"loss": 4.3759,
"step": 896
},
{
"epoch": 0.1,
"grad_norm": 1.6510518771710678,
"learning_rate": 0.000986419145359436,
"loss": 4.1769,
"step": 897
},
{
"epoch": 0.1,
"grad_norm": 1.644861439913146,
"learning_rate": 0.0009863761260013443,
"loss": 4.1194,
"step": 898
},
{
"epoch": 0.1,
"grad_norm": 1.3219321754614737,
"learning_rate": 0.0009863330395569374,
"loss": 4.114,
"step": 899
},
{
"epoch": 0.1,
"grad_norm": 1.721551214848858,
"learning_rate": 0.000986289886032158,
"loss": 4.1789,
"step": 900
},
{
"epoch": 0.1,
"grad_norm": 1.5994473048104454,
"learning_rate": 0.0009862466654329582,
"loss": 4.1776,
"step": 901
},
{
"epoch": 0.1,
"grad_norm": 1.5919192712676673,
"learning_rate": 0.0009862033777652997,
"loss": 4.2,
"step": 902
},
{
"epoch": 0.1,
"grad_norm": 1.6780732129117522,
"learning_rate": 0.000986160023035153,
"loss": 4.1056,
"step": 903
},
{
"epoch": 0.1,
"grad_norm": 1.2632961486485421,
"learning_rate": 0.0009861166012484982,
"loss": 3.9371,
"step": 904
},
{
"epoch": 0.1,
"grad_norm": 2.085402194580041,
"learning_rate": 0.0009860731124113247,
"loss": 3.8418,
"step": 905
},
{
"epoch": 0.1,
"grad_norm": 1.7248485021509143,
"learning_rate": 0.0009860295565296306,
"loss": 4.1345,
"step": 906
},
{
"epoch": 0.1,
"grad_norm": 1.2734828845705393,
"learning_rate": 0.000985985933609424,
"loss": 3.8753,
"step": 907
},
{
"epoch": 0.1,
"grad_norm": 1.6305859669201221,
"learning_rate": 0.0009859422436567212,
"loss": 4.0774,
"step": 908
},
{
"epoch": 0.1,
"grad_norm": 2.631588019075633,
"learning_rate": 0.000985898486677549,
"loss": 4.0552,
"step": 909
},
{
"epoch": 0.1,
"grad_norm": 5.146631533969775,
"learning_rate": 0.0009858546626779425,
"loss": 4.0444,
"step": 910
},
{
"epoch": 0.1,
"grad_norm": 1.4864078775176424,
"learning_rate": 0.0009858107716639464,
"loss": 4.1628,
"step": 911
},
{
"epoch": 0.1,
"grad_norm": 1.255878831726148,
"learning_rate": 0.000985766813641615,
"loss": 3.9995,
"step": 912
},
{
"epoch": 0.1,
"grad_norm": 1.3828357507560953,
"learning_rate": 0.0009857227886170112,
"loss": 3.9688,
"step": 913
},
{
"epoch": 0.1,
"grad_norm": 1.480334348267613,
"learning_rate": 0.0009856786965962074,
"loss": 3.9517,
"step": 914
},
{
"epoch": 0.1,
"grad_norm": 2.186957690755025,
"learning_rate": 0.0009856345375852853,
"loss": 4.1979,
"step": 915
},
{
"epoch": 0.11,
"grad_norm": 2.380674836048694,
"learning_rate": 0.0009855903115903357,
"loss": 4.1367,
"step": 916
},
{
"epoch": 0.11,
"grad_norm": 1.6111821549940448,
"learning_rate": 0.0009855460186174588,
"loss": 4.2874,
"step": 917
},
{
"epoch": 0.11,
"grad_norm": 2.2088280426213163,
"learning_rate": 0.000985501658672764,
"loss": 3.8984,
"step": 918
},
{
"epoch": 0.11,
"grad_norm": 2.1061845949570723,
"learning_rate": 0.0009854572317623698,
"loss": 4.0018,
"step": 919
},
{
"epoch": 0.11,
"grad_norm": 1.5826233210173768,
"learning_rate": 0.0009854127378924043,
"loss": 4.0688,
"step": 920
},
{
"epoch": 0.11,
"grad_norm": 2.319828940222645,
"learning_rate": 0.0009853681770690043,
"loss": 3.9957,
"step": 921
},
{
"epoch": 0.11,
"grad_norm": 2.639522521806413,
"learning_rate": 0.0009853235492983164,
"loss": 3.9084,
"step": 922
},
{
"epoch": 0.11,
"grad_norm": 1.2057356050329093,
"learning_rate": 0.000985278854586496,
"loss": 3.9021,
"step": 923
},
{
"epoch": 0.11,
"grad_norm": 1.415430097369293,
"learning_rate": 0.0009852340929397076,
"loss": 3.931,
"step": 924
},
{
"epoch": 0.11,
"grad_norm": 1.3068607128680707,
"learning_rate": 0.0009851892643641257,
"loss": 3.9395,
"step": 925
},
{
"epoch": 0.11,
"grad_norm": 1.4897963281173685,
"learning_rate": 0.000985144368865933,
"loss": 4.0867,
"step": 926
},
{
"epoch": 0.11,
"grad_norm": 1.2781927711086296,
"learning_rate": 0.0009850994064513226,
"loss": 4.2004,
"step": 927
},
{
"epoch": 0.11,
"grad_norm": 1.320457471366556,
"learning_rate": 0.000985054377126496,
"loss": 4.0297,
"step": 928
},
{
"epoch": 0.11,
"grad_norm": 1.5277842280122598,
"learning_rate": 0.0009850092808976639,
"loss": 4.0596,
"step": 929
},
{
"epoch": 0.11,
"grad_norm": 1.2518662497623179,
"learning_rate": 0.0009849641177710467,
"loss": 4.2665,
"step": 930
},
{
"epoch": 0.11,
"grad_norm": 1.5149067967894774,
"learning_rate": 0.0009849188877528736,
"loss": 4.2191,
"step": 931
},
{
"epoch": 0.11,
"grad_norm": 1.6905230949838688,
"learning_rate": 0.0009848735908493834,
"loss": 4.0743,
"step": 932
},
{
"epoch": 0.11,
"grad_norm": 1.1929701681584368,
"learning_rate": 0.0009848282270668238,
"loss": 4.1644,
"step": 933
},
{
"epoch": 0.11,
"grad_norm": 1.2257841469605364,
"learning_rate": 0.000984782796411452,
"loss": 3.9519,
"step": 934
},
{
"epoch": 0.11,
"grad_norm": 1.176879136105362,
"learning_rate": 0.0009847372988895343,
"loss": 3.9867,
"step": 935
},
{
"epoch": 0.11,
"grad_norm": 1.9992555455613272,
"learning_rate": 0.000984691734507346,
"loss": 4.0503,
"step": 936
},
{
"epoch": 0.11,
"grad_norm": 1.1602851654795971,
"learning_rate": 0.0009846461032711723,
"loss": 3.7602,
"step": 937
},
{
"epoch": 0.11,
"grad_norm": 1.8971186764060644,
"learning_rate": 0.0009846004051873066,
"loss": 4.0302,
"step": 938
},
{
"epoch": 0.11,
"grad_norm": 1.5633693369995434,
"learning_rate": 0.0009845546402620523,
"loss": 4.0715,
"step": 939
},
{
"epoch": 0.11,
"grad_norm": 1.1842397888352805,
"learning_rate": 0.0009845088085017218,
"loss": 4.1694,
"step": 940
},
{
"epoch": 0.11,
"grad_norm": 1.207349419360585,
"learning_rate": 0.000984462909912637,
"loss": 4.1562,
"step": 941
},
{
"epoch": 0.11,
"grad_norm": 3.1162379496479007,
"learning_rate": 0.0009844169445011282,
"loss": 4.2172,
"step": 942
},
{
"epoch": 0.11,
"grad_norm": 1.2323412620635488,
"learning_rate": 0.0009843709122735358,
"loss": 4.0566,
"step": 943
},
{
"epoch": 0.11,
"grad_norm": 1.422142142553048,
"learning_rate": 0.000984324813236209,
"loss": 3.9576,
"step": 944
},
{
"epoch": 0.11,
"grad_norm": 1.2023201557705174,
"learning_rate": 0.0009842786473955062,
"loss": 4.0906,
"step": 945
},
{
"epoch": 0.11,
"grad_norm": 1.285804898787631,
"learning_rate": 0.0009842324147577954,
"loss": 3.8996,
"step": 946
},
{
"epoch": 0.11,
"grad_norm": 1.3110230140133707,
"learning_rate": 0.0009841861153294534,
"loss": 4.1524,
"step": 947
},
{
"epoch": 0.11,
"grad_norm": 1.8117412704677454,
"learning_rate": 0.000984139749116866,
"loss": 4.1041,
"step": 948
},
{
"epoch": 0.11,
"grad_norm": 1.321714473979289,
"learning_rate": 0.0009840933161264288,
"loss": 4.2487,
"step": 949
},
{
"epoch": 0.11,
"grad_norm": 1.984874592434313,
"learning_rate": 0.0009840468163645462,
"loss": 4.0886,
"step": 950
},
{
"epoch": 0.11,
"grad_norm": 1.3566132759748373,
"learning_rate": 0.0009840002498376322,
"loss": 4.4746,
"step": 951
},
{
"epoch": 0.11,
"grad_norm": 1.317596157846536,
"learning_rate": 0.0009839536165521094,
"loss": 3.9685,
"step": 952
},
{
"epoch": 0.11,
"grad_norm": 2.4264478553663755,
"learning_rate": 0.0009839069165144103,
"loss": 4.1508,
"step": 953
},
{
"epoch": 0.11,
"grad_norm": 1.4048612046314342,
"learning_rate": 0.0009838601497309763,
"loss": 3.8894,
"step": 954
},
{
"epoch": 0.11,
"grad_norm": 4.379628338406131,
"learning_rate": 0.0009838133162082578,
"loss": 3.9963,
"step": 955
},
{
"epoch": 0.11,
"grad_norm": 1.3055416549723382,
"learning_rate": 0.0009837664159527146,
"loss": 4.0372,
"step": 956
},
{
"epoch": 0.11,
"grad_norm": 1.446338268413068,
"learning_rate": 0.0009837194489708157,
"loss": 4.1787,
"step": 957
},
{
"epoch": 0.11,
"grad_norm": 1.1405526985096115,
"learning_rate": 0.0009836724152690395,
"loss": 4.1131,
"step": 958
},
{
"epoch": 0.11,
"grad_norm": 1.2692440128786606,
"learning_rate": 0.0009836253148538731,
"loss": 4.0634,
"step": 959
},
{
"epoch": 0.11,
"grad_norm": 2.9207788332963034,
"learning_rate": 0.0009835781477318133,
"loss": 4.1055,
"step": 960
},
{
"epoch": 0.11,
"grad_norm": 1.7000128588792343,
"learning_rate": 0.000983530913909366,
"loss": 4.1679,
"step": 961
},
{
"epoch": 0.11,
"grad_norm": 1.2184849719349153,
"learning_rate": 0.0009834836133930458,
"loss": 4.0771,
"step": 962
},
{
"epoch": 0.11,
"grad_norm": 1.1866419847927028,
"learning_rate": 0.0009834362461893773,
"loss": 4.0561,
"step": 963
},
{
"epoch": 0.11,
"grad_norm": 1.1816064035871716,
"learning_rate": 0.0009833888123048937,
"loss": 4.0231,
"step": 964
},
{
"epoch": 0.11,
"grad_norm": 6.652066844511286,
"learning_rate": 0.0009833413117461378,
"loss": 4.0028,
"step": 965
},
{
"epoch": 0.11,
"grad_norm": 1.4053628013176662,
"learning_rate": 0.0009832937445196613,
"loss": 3.908,
"step": 966
},
{
"epoch": 0.11,
"grad_norm": 6.141565525936523,
"learning_rate": 0.000983246110632025,
"loss": 4.275,
"step": 967
},
{
"epoch": 0.11,
"grad_norm": 1.4278201820569127,
"learning_rate": 0.0009831984100897994,
"loss": 4.0542,
"step": 968
},
{
"epoch": 0.11,
"grad_norm": 1.199384260694025,
"learning_rate": 0.0009831506428995636,
"loss": 4.1086,
"step": 969
},
{
"epoch": 0.11,
"grad_norm": 1.2346590703393536,
"learning_rate": 0.0009831028090679064,
"loss": 3.9633,
"step": 970
},
{
"epoch": 0.11,
"grad_norm": 1.3517228515130117,
"learning_rate": 0.0009830549086014254,
"loss": 3.9808,
"step": 971
},
{
"epoch": 0.11,
"grad_norm": 1.926508834080503,
"learning_rate": 0.0009830069415067276,
"loss": 3.8177,
"step": 972
},
{
"epoch": 0.11,
"grad_norm": 4.397354890876198,
"learning_rate": 0.0009829589077904293,
"loss": 3.995,
"step": 973
},
{
"epoch": 0.11,
"grad_norm": 2.5506131629600852,
"learning_rate": 0.0009829108074591556,
"loss": 3.9053,
"step": 974
},
{
"epoch": 0.11,
"grad_norm": 1.4924478582790404,
"learning_rate": 0.0009828626405195412,
"loss": 3.6817,
"step": 975
},
{
"epoch": 0.11,
"grad_norm": 1.287765183688655,
"learning_rate": 0.0009828144069782296,
"loss": 4.0913,
"step": 976
},
{
"epoch": 0.11,
"grad_norm": 1.816691401337574,
"learning_rate": 0.0009827661068418738,
"loss": 4.1,
"step": 977
},
{
"epoch": 0.11,
"grad_norm": 1.1693005548358926,
"learning_rate": 0.0009827177401171361,
"loss": 4.0689,
"step": 978
},
{
"epoch": 0.11,
"grad_norm": 1.1144680941717817,
"learning_rate": 0.0009826693068106876,
"loss": 4.0424,
"step": 979
},
{
"epoch": 0.11,
"grad_norm": 1.2934458002590496,
"learning_rate": 0.0009826208069292086,
"loss": 4.0227,
"step": 980
},
{
"epoch": 0.11,
"grad_norm": 1.4835842495086131,
"learning_rate": 0.000982572240479389,
"loss": 4.0982,
"step": 981
},
{
"epoch": 0.11,
"grad_norm": 1.271483482139452,
"learning_rate": 0.0009825236074679274,
"loss": 4.0279,
"step": 982
},
{
"epoch": 0.11,
"grad_norm": 1.7214152289466564,
"learning_rate": 0.0009824749079015318,
"loss": 4.1749,
"step": 983
},
{
"epoch": 0.11,
"grad_norm": 1.5790129001826994,
"learning_rate": 0.0009824261417869197,
"loss": 3.902,
"step": 984
},
{
"epoch": 0.11,
"grad_norm": 1.2710077031760036,
"learning_rate": 0.000982377309130817,
"loss": 4.1015,
"step": 985
},
{
"epoch": 0.11,
"grad_norm": 1.079904617272973,
"learning_rate": 0.0009823284099399596,
"loss": 4.1324,
"step": 986
},
{
"epoch": 0.11,
"grad_norm": 1.3034347301445846,
"learning_rate": 0.000982279444221092,
"loss": 4.1291,
"step": 987
},
{
"epoch": 0.11,
"grad_norm": 1.2092294885850745,
"learning_rate": 0.0009822304119809682,
"loss": 4.0041,
"step": 988
},
{
"epoch": 0.11,
"grad_norm": 1.2730109767279272,
"learning_rate": 0.0009821813132263513,
"loss": 3.9237,
"step": 989
},
{
"epoch": 0.11,
"grad_norm": 1.5581751315706167,
"learning_rate": 0.0009821321479640134,
"loss": 4.2097,
"step": 990
},
{
"epoch": 0.11,
"grad_norm": 1.6129165168717297,
"learning_rate": 0.0009820829162007357,
"loss": 3.9833,
"step": 991
},
{
"epoch": 0.11,
"grad_norm": 1.2873737278633184,
"learning_rate": 0.0009820336179433091,
"loss": 4.171,
"step": 992
},
{
"epoch": 0.11,
"grad_norm": 1.4312967003286392,
"learning_rate": 0.0009819842531985337,
"loss": 4.0234,
"step": 993
},
{
"epoch": 0.11,
"grad_norm": 1.6915493531514212,
"learning_rate": 0.0009819348219732176,
"loss": 4.0339,
"step": 994
},
{
"epoch": 0.11,
"grad_norm": 1.5336906521936657,
"learning_rate": 0.0009818853242741796,
"loss": 4.0189,
"step": 995
},
{
"epoch": 0.11,
"grad_norm": 1.3149077305768606,
"learning_rate": 0.0009818357601082467,
"loss": 3.9543,
"step": 996
},
{
"epoch": 0.11,
"grad_norm": 1.5716224194463693,
"learning_rate": 0.0009817861294822551,
"loss": 4.0773,
"step": 997
},
{
"epoch": 0.11,
"grad_norm": 1.40519710985977,
"learning_rate": 0.0009817364324030506,
"loss": 3.9557,
"step": 998
},
{
"epoch": 0.11,
"grad_norm": 1.01794971224117,
"learning_rate": 0.0009816866688774882,
"loss": 4.1058,
"step": 999
},
{
"epoch": 0.11,
"grad_norm": 4.860652808748809,
"learning_rate": 0.0009816368389124314,
"loss": 4.2831,
"step": 1000
},
{
"epoch": 0.11,
"grad_norm": 4.402129028043682,
"learning_rate": 0.0009815869425147537,
"loss": 4.0437,
"step": 1001
},
{
"epoch": 0.11,
"grad_norm": 1.4823211432674894,
"learning_rate": 0.0009815369796913373,
"loss": 4.2357,
"step": 1002
},
{
"epoch": 0.12,
"grad_norm": 1.4790538387348315,
"learning_rate": 0.0009814869504490731,
"loss": 4.0382,
"step": 1003
},
{
"epoch": 0.12,
"grad_norm": 1.2730933337418775,
"learning_rate": 0.0009814368547948623,
"loss": 4.1144,
"step": 1004
},
{
"epoch": 0.12,
"grad_norm": 1.1489404794798845,
"learning_rate": 0.0009813866927356142,
"loss": 4.206,
"step": 1005
},
{
"epoch": 0.12,
"grad_norm": 2.718423900008804,
"learning_rate": 0.000981336464278248,
"loss": 4.0481,
"step": 1006
},
{
"epoch": 0.12,
"grad_norm": 1.9794000243327088,
"learning_rate": 0.0009812861694296917,
"loss": 4.195,
"step": 1007
},
{
"epoch": 0.12,
"grad_norm": 1.047499317782648,
"learning_rate": 0.0009812358081968825,
"loss": 3.8762,
"step": 1008
},
{
"epoch": 0.12,
"grad_norm": 1.312670767675802,
"learning_rate": 0.0009811853805867668,
"loss": 3.8914,
"step": 1009
},
{
"epoch": 0.12,
"grad_norm": 1.2490019835768826,
"learning_rate": 0.0009811348866063,
"loss": 4.0617,
"step": 1010
},
{
"epoch": 0.12,
"grad_norm": 1.4056700523809305,
"learning_rate": 0.0009810843262624467,
"loss": 4.3159,
"step": 1011
},
{
"epoch": 0.12,
"grad_norm": 1.6390340828956516,
"learning_rate": 0.000981033699562181,
"loss": 4.2921,
"step": 1012
},
{
"epoch": 0.12,
"grad_norm": 1.162608838536548,
"learning_rate": 0.0009809830065124858,
"loss": 4.1538,
"step": 1013
},
{
"epoch": 0.12,
"grad_norm": 1.6026138425946854,
"learning_rate": 0.0009809322471203534,
"loss": 4.0057,
"step": 1014
},
{
"epoch": 0.12,
"grad_norm": 1.7498607343990364,
"learning_rate": 0.0009808814213927847,
"loss": 4.1682,
"step": 1015
},
{
"epoch": 0.12,
"grad_norm": 1.3922144822159193,
"learning_rate": 0.0009808305293367904,
"loss": 4.1587,
"step": 1016
},
{
"epoch": 0.12,
"grad_norm": 26.277603721175055,
"learning_rate": 0.00098077957095939,
"loss": 4.0795,
"step": 1017
},
{
"epoch": 0.12,
"grad_norm": 3.8112385727503764,
"learning_rate": 0.0009807285462676122,
"loss": 4.066,
"step": 1018
},
{
"epoch": 0.12,
"grad_norm": 1.8959416289394118,
"learning_rate": 0.0009806774552684953,
"loss": 4.0906,
"step": 1019
},
{
"epoch": 0.12,
"grad_norm": 1.622234905733966,
"learning_rate": 0.0009806262979690857,
"loss": 3.919,
"step": 1020
},
{
"epoch": 0.12,
"grad_norm": 1.4925078199962707,
"learning_rate": 0.00098057507437644,
"loss": 3.8106,
"step": 1021
},
{
"epoch": 0.12,
"grad_norm": 1.4575498548230545,
"learning_rate": 0.0009805237844976234,
"loss": 4.0375,
"step": 1022
},
{
"epoch": 0.12,
"grad_norm": 3.7165346716241943,
"learning_rate": 0.00098047242833971,
"loss": 4.1195,
"step": 1023
},
{
"epoch": 0.12,
"grad_norm": 1.390624756790253,
"learning_rate": 0.0009804210059097841,
"loss": 4.0582,
"step": 1024
},
{
"epoch": 0.12,
"grad_norm": 1.627757532921064,
"learning_rate": 0.0009803695172149382,
"loss": 4.0557,
"step": 1025
},
{
"epoch": 0.12,
"grad_norm": 6.181473916727016,
"learning_rate": 0.0009803179622622738,
"loss": 3.9394,
"step": 1026
},
{
"epoch": 0.12,
"grad_norm": 1.9392090615767972,
"learning_rate": 0.0009802663410589023,
"loss": 4.2315,
"step": 1027
},
{
"epoch": 0.12,
"grad_norm": 1.3179995506591398,
"learning_rate": 0.0009802146536119437,
"loss": 3.9797,
"step": 1028
},
{
"epoch": 0.12,
"grad_norm": 1.45031357075047,
"learning_rate": 0.0009801628999285274,
"loss": 3.975,
"step": 1029
},
{
"epoch": 0.12,
"grad_norm": 5.921840222943073,
"learning_rate": 0.000980111080015792,
"loss": 4.1748,
"step": 1030
},
{
"epoch": 0.12,
"grad_norm": 1.2850071444042936,
"learning_rate": 0.0009800591938808846,
"loss": 3.877,
"step": 1031
},
{
"epoch": 0.12,
"grad_norm": 1.2793305392606493,
"learning_rate": 0.0009800072415309623,
"loss": 4.2125,
"step": 1032
},
{
"epoch": 0.12,
"grad_norm": 2.1523367727147753,
"learning_rate": 0.0009799552229731907,
"loss": 3.8641,
"step": 1033
},
{
"epoch": 0.12,
"grad_norm": 4.354578693520637,
"learning_rate": 0.0009799031382147448,
"loss": 4.1685,
"step": 1034
},
{
"epoch": 0.12,
"grad_norm": 3.4434103059761116,
"learning_rate": 0.000979850987262809,
"loss": 4.0627,
"step": 1035
},
{
"epoch": 0.12,
"grad_norm": 0.7087287464153761,
"learning_rate": 0.0009797987701245761,
"loss": 3.9373,
"step": 1036
},
{
"epoch": 0.12,
"grad_norm": 1.4322652783547312,
"learning_rate": 0.0009797464868072487,
"loss": 4.1694,
"step": 1037
},
{
"epoch": 0.12,
"grad_norm": 1.258691315160102,
"learning_rate": 0.0009796941373180384,
"loss": 4.2494,
"step": 1038
},
{
"epoch": 0.12,
"grad_norm": 1.3545471197187142,
"learning_rate": 0.0009796417216641653,
"loss": 4.1533,
"step": 1039
},
{
"epoch": 0.12,
"grad_norm": 1.0610387625266209,
"learning_rate": 0.00097958923985286,
"loss": 4.0507,
"step": 1040
},
{
"epoch": 0.12,
"grad_norm": 1.1441854336507373,
"learning_rate": 0.0009795366918913604,
"loss": 4.1602,
"step": 1041
},
{
"epoch": 0.12,
"grad_norm": 1.683186222995479,
"learning_rate": 0.0009794840777869152,
"loss": 4.1068,
"step": 1042
},
{
"epoch": 0.12,
"grad_norm": 1.3010141741874532,
"learning_rate": 0.0009794313975467813,
"loss": 4.3194,
"step": 1043
},
{
"epoch": 0.12,
"grad_norm": 1.401158710932506,
"learning_rate": 0.0009793786511782248,
"loss": 3.9999,
"step": 1044
},
{
"epoch": 0.12,
"grad_norm": 1.2228105604228117,
"learning_rate": 0.000979325838688521,
"loss": 4.0648,
"step": 1045
},
{
"epoch": 0.12,
"grad_norm": 1.3001665446288744,
"learning_rate": 0.000979272960084955,
"loss": 4.1342,
"step": 1046
},
{
"epoch": 0.12,
"grad_norm": 1.923309509834412,
"learning_rate": 0.0009792200153748195,
"loss": 4.0739,
"step": 1047
},
{
"epoch": 0.12,
"grad_norm": 3.6890518840189803,
"learning_rate": 0.0009791670045654177,
"loss": 4.0867,
"step": 1048
},
{
"epoch": 0.12,
"grad_norm": 1.5598404960921566,
"learning_rate": 0.0009791139276640614,
"loss": 3.8474,
"step": 1049
},
{
"epoch": 0.12,
"grad_norm": 1.747740107737911,
"learning_rate": 0.0009790607846780718,
"loss": 4.087,
"step": 1050
},
{
"epoch": 0.12,
"grad_norm": 3.5324063230961253,
"learning_rate": 0.0009790075756147783,
"loss": 4.0739,
"step": 1051
},
{
"epoch": 0.12,
"grad_norm": 2.171647229274092,
"learning_rate": 0.0009789543004815207,
"loss": 3.9563,
"step": 1052
},
{
"epoch": 0.12,
"grad_norm": 1.4985059033678876,
"learning_rate": 0.000978900959285647,
"loss": 4.2187,
"step": 1053
},
{
"epoch": 0.12,
"grad_norm": 6.064379861767049,
"learning_rate": 0.0009788475520345146,
"loss": 4.3221,
"step": 1054
},
{
"epoch": 0.12,
"grad_norm": 1.4971275840059683,
"learning_rate": 0.0009787940787354902,
"loss": 4.069,
"step": 1055
},
{
"epoch": 0.12,
"grad_norm": 2.263977554472576,
"learning_rate": 0.000978740539395949,
"loss": 4.4513,
"step": 1056
},
{
"epoch": 0.12,
"grad_norm": 1.4908730806011838,
"learning_rate": 0.0009786869340232761,
"loss": 4.3548,
"step": 1057
},
{
"epoch": 0.12,
"grad_norm": 1.4719282241238945,
"learning_rate": 0.0009786332626248655,
"loss": 3.9538,
"step": 1058
},
{
"epoch": 0.12,
"grad_norm": 1.340626012032056,
"learning_rate": 0.0009785795252081199,
"loss": 4.0992,
"step": 1059
},
{
"epoch": 0.12,
"grad_norm": 1.306660821928319,
"learning_rate": 0.000978525721780451,
"loss": 4.1287,
"step": 1060
},
{
"epoch": 0.12,
"grad_norm": 1.2300265414566451,
"learning_rate": 0.0009784718523492804,
"loss": 3.9858,
"step": 1061
},
{
"epoch": 0.12,
"grad_norm": 1.1588443166430544,
"learning_rate": 0.0009784179169220384,
"loss": 4.2258,
"step": 1062
},
{
"epoch": 0.12,
"grad_norm": 1.57047185802853,
"learning_rate": 0.0009783639155061643,
"loss": 3.7996,
"step": 1063
},
{
"epoch": 0.12,
"grad_norm": 1.2808171957774321,
"learning_rate": 0.0009783098481091063,
"loss": 4.0964,
"step": 1064
},
{
"epoch": 0.12,
"grad_norm": 1.3042816824374917,
"learning_rate": 0.0009782557147383225,
"loss": 3.9128,
"step": 1065
},
{
"epoch": 0.12,
"grad_norm": 1.1420537559137431,
"learning_rate": 0.0009782015154012789,
"loss": 4.1346,
"step": 1066
},
{
"epoch": 0.12,
"grad_norm": 1.327327290934473,
"learning_rate": 0.0009781472501054517,
"loss": 3.9409,
"step": 1067
},
{
"epoch": 0.12,
"grad_norm": 3.05505262983701,
"learning_rate": 0.0009780929188583256,
"loss": 4.0558,
"step": 1068
},
{
"epoch": 0.12,
"grad_norm": 1.1374303636099061,
"learning_rate": 0.000978038521667395,
"loss": 4.0334,
"step": 1069
},
{
"epoch": 0.12,
"grad_norm": 1.2314833517094605,
"learning_rate": 0.000977984058540162,
"loss": 4.0432,
"step": 1070
},
{
"epoch": 0.12,
"grad_norm": 1.103150152733436,
"learning_rate": 0.0009779295294841397,
"loss": 3.9199,
"step": 1071
},
{
"epoch": 0.12,
"grad_norm": 1.3462765793506142,
"learning_rate": 0.0009778749345068487,
"loss": 4.0583,
"step": 1072
},
{
"epoch": 0.12,
"grad_norm": 1.739819561788374,
"learning_rate": 0.00097782027361582,
"loss": 4.0955,
"step": 1073
},
{
"epoch": 0.12,
"grad_norm": 1.2288585735959598,
"learning_rate": 0.0009777655468185924,
"loss": 3.9591,
"step": 1074
},
{
"epoch": 0.12,
"grad_norm": 1.5945075402894981,
"learning_rate": 0.0009777107541227147,
"loss": 4.142,
"step": 1075
},
{
"epoch": 0.12,
"grad_norm": 2.389277968946144,
"learning_rate": 0.0009776558955357443,
"loss": 4.0779,
"step": 1076
},
{
"epoch": 0.12,
"grad_norm": 1.4375073059031855,
"learning_rate": 0.0009776009710652483,
"loss": 3.9904,
"step": 1077
},
{
"epoch": 0.12,
"grad_norm": 1.4348995213661113,
"learning_rate": 0.0009775459807188022,
"loss": 3.9242,
"step": 1078
},
{
"epoch": 0.12,
"grad_norm": 1.5934869452469487,
"learning_rate": 0.0009774909245039909,
"loss": 3.8806,
"step": 1079
},
{
"epoch": 0.12,
"grad_norm": 1.479845608471666,
"learning_rate": 0.0009774358024284082,
"loss": 4.0791,
"step": 1080
},
{
"epoch": 0.12,
"grad_norm": 1.117296021687194,
"learning_rate": 0.0009773806144996575,
"loss": 3.9955,
"step": 1081
},
{
"epoch": 0.12,
"grad_norm": 2.183772677672718,
"learning_rate": 0.0009773253607253507,
"loss": 3.9873,
"step": 1082
},
{
"epoch": 0.12,
"grad_norm": 1.5789962639928703,
"learning_rate": 0.000977270041113109,
"loss": 3.9276,
"step": 1083
},
{
"epoch": 0.12,
"grad_norm": 1.0193743656006615,
"learning_rate": 0.0009772146556705629,
"loss": 3.9963,
"step": 1084
},
{
"epoch": 0.12,
"grad_norm": 2.2295991421472445,
"learning_rate": 0.0009771592044053512,
"loss": 4.1109,
"step": 1085
},
{
"epoch": 0.12,
"grad_norm": 2.9581005964807026,
"learning_rate": 0.000977103687325123,
"loss": 4.2778,
"step": 1086
},
{
"epoch": 0.12,
"grad_norm": 1.2850283541503715,
"learning_rate": 0.0009770481044375356,
"loss": 3.7693,
"step": 1087
},
{
"epoch": 0.12,
"grad_norm": 1.1692506007914478,
"learning_rate": 0.0009769924557502553,
"loss": 4.0464,
"step": 1088
},
{
"epoch": 0.12,
"grad_norm": 1.360475215456834,
"learning_rate": 0.0009769367412709585,
"loss": 4.0362,
"step": 1089
},
{
"epoch": 0.12,
"grad_norm": 1.493610507999107,
"learning_rate": 0.0009768809610073291,
"loss": 3.9251,
"step": 1090
},
{
"epoch": 0.13,
"grad_norm": 2.0423129596254905,
"learning_rate": 0.0009768251149670614,
"loss": 4.0073,
"step": 1091
},
{
"epoch": 0.13,
"grad_norm": 1.126386563454413,
"learning_rate": 0.000976769203157858,
"loss": 4.0113,
"step": 1092
},
{
"epoch": 0.13,
"grad_norm": 1.6427277520739216,
"learning_rate": 0.0009767132255874315,
"loss": 4.0531,
"step": 1093
},
{
"epoch": 0.13,
"grad_norm": 1.4804944398674211,
"learning_rate": 0.0009766571822635022,
"loss": 3.9697,
"step": 1094
},
{
"epoch": 0.13,
"grad_norm": 1.4291578612082383,
"learning_rate": 0.0009766010731938007,
"loss": 4.094,
"step": 1095
},
{
"epoch": 0.13,
"grad_norm": 1.5141542606663225,
"learning_rate": 0.0009765448983860658,
"loss": 3.9599,
"step": 1096
},
{
"epoch": 0.13,
"grad_norm": 1.8248712042767385,
"learning_rate": 0.0009764886578480461,
"loss": 3.8438,
"step": 1097
},
{
"epoch": 0.13,
"grad_norm": 1.372364023592408,
"learning_rate": 0.0009764323515874986,
"loss": 4.222,
"step": 1098
},
{
"epoch": 0.13,
"grad_norm": 1.231832143212094,
"learning_rate": 0.00097637597961219,
"loss": 4.1938,
"step": 1099
},
{
"epoch": 0.13,
"grad_norm": 1.4374091428190032,
"learning_rate": 0.0009763195419298955,
"loss": 4.0506,
"step": 1100
},
{
"epoch": 0.13,
"grad_norm": 3.022287851742623,
"learning_rate": 0.0009762630385483997,
"loss": 4.0225,
"step": 1101
},
{
"epoch": 0.13,
"grad_norm": 1.3815036560974403,
"learning_rate": 0.000976206469475496,
"loss": 3.9241,
"step": 1102
},
{
"epoch": 0.13,
"grad_norm": 1.5642226838309352,
"learning_rate": 0.0009761498347189872,
"loss": 4.1569,
"step": 1103
},
{
"epoch": 0.13,
"grad_norm": 1.8100534976268143,
"learning_rate": 0.000976093134286685,
"loss": 4.1148,
"step": 1104
},
{
"epoch": 0.13,
"grad_norm": 1.1517123086389929,
"learning_rate": 0.0009760363681864102,
"loss": 4.0619,
"step": 1105
},
{
"epoch": 0.13,
"grad_norm": 1.612100174452912,
"learning_rate": 0.0009759795364259923,
"loss": 3.9537,
"step": 1106
},
{
"epoch": 0.13,
"grad_norm": 1.208072970379895,
"learning_rate": 0.0009759226390132704,
"loss": 4.0334,
"step": 1107
},
{
"epoch": 0.13,
"grad_norm": 1.3887279501098326,
"learning_rate": 0.0009758656759560923,
"loss": 3.9506,
"step": 1108
},
{
"epoch": 0.13,
"grad_norm": 1.3908901557035898,
"learning_rate": 0.0009758086472623151,
"loss": 3.9903,
"step": 1109
},
{
"epoch": 0.13,
"grad_norm": 5.983151859479315,
"learning_rate": 0.0009757515529398047,
"loss": 3.9413,
"step": 1110
},
{
"epoch": 0.13,
"grad_norm": 1.2815439165164777,
"learning_rate": 0.0009756943929964363,
"loss": 3.9637,
"step": 1111
},
{
"epoch": 0.13,
"grad_norm": 1.3602353462716779,
"learning_rate": 0.0009756371674400939,
"loss": 3.957,
"step": 1112
},
{
"epoch": 0.13,
"grad_norm": 1.3029872792102541,
"learning_rate": 0.0009755798762786707,
"loss": 4.0644,
"step": 1113
},
{
"epoch": 0.13,
"grad_norm": 1.1470586218455885,
"learning_rate": 0.0009755225195200689,
"loss": 3.7977,
"step": 1114
},
{
"epoch": 0.13,
"grad_norm": 2.3476550444761335,
"learning_rate": 0.0009754650971722,
"loss": 4.1035,
"step": 1115
},
{
"epoch": 0.13,
"grad_norm": 1.4704414796415541,
"learning_rate": 0.000975407609242984,
"loss": 4.0167,
"step": 1116
},
{
"epoch": 0.13,
"grad_norm": 3.768555022379108,
"learning_rate": 0.0009753500557403504,
"loss": 4.0987,
"step": 1117
},
{
"epoch": 0.13,
"grad_norm": 1.7362033927045515,
"learning_rate": 0.0009752924366722376,
"loss": 3.9892,
"step": 1118
},
{
"epoch": 0.13,
"grad_norm": 1.2465462081912857,
"learning_rate": 0.0009752347520465931,
"loss": 4.1808,
"step": 1119
},
{
"epoch": 0.13,
"grad_norm": 2.365849090450502,
"learning_rate": 0.0009751770018713734,
"loss": 4.0288,
"step": 1120
},
{
"epoch": 0.13,
"grad_norm": 1.2217654375116476,
"learning_rate": 0.0009751191861545439,
"loss": 4.0525,
"step": 1121
},
{
"epoch": 0.13,
"grad_norm": 4.082356606142538,
"learning_rate": 0.0009750613049040792,
"loss": 4.2771,
"step": 1122
},
{
"epoch": 0.13,
"grad_norm": 2.215685880262868,
"learning_rate": 0.0009750033581279632,
"loss": 3.9067,
"step": 1123
},
{
"epoch": 0.13,
"grad_norm": 1.9365207432276945,
"learning_rate": 0.0009749453458341882,
"loss": 4.2362,
"step": 1124
},
{
"epoch": 0.13,
"grad_norm": 1.243393173952921,
"learning_rate": 0.000974887268030756,
"loss": 4.1093,
"step": 1125
},
{
"epoch": 0.13,
"grad_norm": 1.113055690560134,
"learning_rate": 0.0009748291247256774,
"loss": 4.0303,
"step": 1126
},
{
"epoch": 0.13,
"grad_norm": 1.2195322565632427,
"learning_rate": 0.000974770915926972,
"loss": 4.2279,
"step": 1127
},
{
"epoch": 0.13,
"grad_norm": 1.2534133818251345,
"learning_rate": 0.0009747126416426688,
"loss": 4.1046,
"step": 1128
},
{
"epoch": 0.13,
"grad_norm": 2.2762379100531556,
"learning_rate": 0.0009746543018808057,
"loss": 3.9392,
"step": 1129
},
{
"epoch": 0.13,
"grad_norm": 1.32019351523326,
"learning_rate": 0.000974595896649429,
"loss": 4.0348,
"step": 1130
},
{
"epoch": 0.13,
"grad_norm": 1.2715962000111387,
"learning_rate": 0.0009745374259565953,
"loss": 3.9784,
"step": 1131
},
{
"epoch": 0.13,
"grad_norm": 1.037934731168759,
"learning_rate": 0.0009744788898103691,
"loss": 3.7759,
"step": 1132
},
{
"epoch": 0.13,
"grad_norm": 1.1862931088699729,
"learning_rate": 0.0009744202882188245,
"loss": 3.9936,
"step": 1133
},
{
"epoch": 0.13,
"grad_norm": 1.5470478998787935,
"learning_rate": 0.0009743616211900443,
"loss": 4.3491,
"step": 1134
},
{
"epoch": 0.13,
"grad_norm": 1.7282053763887202,
"learning_rate": 0.0009743028887321206,
"loss": 4.2604,
"step": 1135
},
{
"epoch": 0.13,
"grad_norm": 2.04449528113819,
"learning_rate": 0.0009742440908531545,
"loss": 3.8811,
"step": 1136
},
{
"epoch": 0.13,
"grad_norm": 1.240227069887739,
"learning_rate": 0.0009741852275612559,
"loss": 4.0271,
"step": 1137
},
{
"epoch": 0.13,
"grad_norm": 1.1097643794007848,
"learning_rate": 0.0009741262988645441,
"loss": 3.6696,
"step": 1138
},
{
"epoch": 0.13,
"grad_norm": 1.2398726318213478,
"learning_rate": 0.000974067304771147,
"loss": 4.2604,
"step": 1139
},
{
"epoch": 0.13,
"grad_norm": 1.0346633290848877,
"learning_rate": 0.0009740082452892017,
"loss": 3.9715,
"step": 1140
},
{
"epoch": 0.13,
"grad_norm": 1.1932297566003178,
"learning_rate": 0.0009739491204268545,
"loss": 3.9658,
"step": 1141
},
{
"epoch": 0.13,
"grad_norm": 1.205304594451489,
"learning_rate": 0.0009738899301922602,
"loss": 3.98,
"step": 1142
},
{
"epoch": 0.13,
"grad_norm": 1.2020454920984538,
"learning_rate": 0.0009738306745935833,
"loss": 3.7182,
"step": 1143
},
{
"epoch": 0.13,
"grad_norm": 1.260432300070686,
"learning_rate": 0.0009737713536389969,
"loss": 4.2419,
"step": 1144
},
{
"epoch": 0.13,
"grad_norm": 1.1133372525884535,
"learning_rate": 0.0009737119673366832,
"loss": 4.0668,
"step": 1145
},
{
"epoch": 0.13,
"grad_norm": 1.4353979020163097,
"learning_rate": 0.0009736525156948333,
"loss": 4.1236,
"step": 1146
},
{
"epoch": 0.13,
"grad_norm": 3.232836220065639,
"learning_rate": 0.0009735929987216476,
"loss": 3.9061,
"step": 1147
},
{
"epoch": 0.13,
"grad_norm": 1.5406095573245218,
"learning_rate": 0.0009735334164253351,
"loss": 4.0218,
"step": 1148
},
{
"epoch": 0.13,
"grad_norm": 1.9271397077839116,
"learning_rate": 0.0009734737688141142,
"loss": 4.0121,
"step": 1149
},
{
"epoch": 0.13,
"grad_norm": 1.8965940181337684,
"learning_rate": 0.0009734140558962123,
"loss": 3.9482,
"step": 1150
},
{
"epoch": 0.13,
"grad_norm": 2.617457047223621,
"learning_rate": 0.0009733542776798653,
"loss": 4.0872,
"step": 1151
},
{
"epoch": 0.13,
"grad_norm": 1.3860655864056333,
"learning_rate": 0.0009732944341733188,
"loss": 4.042,
"step": 1152
},
{
"epoch": 0.13,
"grad_norm": 1.747993380220601,
"learning_rate": 0.0009732345253848267,
"loss": 4.0696,
"step": 1153
},
{
"epoch": 0.13,
"grad_norm": 1.207376581682096,
"learning_rate": 0.0009731745513226526,
"loss": 3.9544,
"step": 1154
},
{
"epoch": 0.13,
"grad_norm": 1.2256604213238373,
"learning_rate": 0.0009731145119950686,
"loss": 4.0532,
"step": 1155
},
{
"epoch": 0.13,
"grad_norm": 1.0817362493268414,
"learning_rate": 0.0009730544074103562,
"loss": 4.1034,
"step": 1156
},
{
"epoch": 0.13,
"grad_norm": 1.1600626371391756,
"learning_rate": 0.0009729942375768055,
"loss": 4.191,
"step": 1157
},
{
"epoch": 0.13,
"grad_norm": 1.70580656369459,
"learning_rate": 0.0009729340025027158,
"loss": 4.1685,
"step": 1158
},
{
"epoch": 0.13,
"grad_norm": 3.0876482048800766,
"learning_rate": 0.0009728737021963954,
"loss": 4.1988,
"step": 1159
},
{
"epoch": 0.13,
"grad_norm": 1.046961117495001,
"learning_rate": 0.0009728133366661615,
"loss": 3.8947,
"step": 1160
},
{
"epoch": 0.13,
"grad_norm": 1.2109004698355041,
"learning_rate": 0.0009727529059203406,
"loss": 3.9896,
"step": 1161
},
{
"epoch": 0.13,
"grad_norm": 1.7573711158460956,
"learning_rate": 0.0009726924099672676,
"loss": 3.8891,
"step": 1162
},
{
"epoch": 0.13,
"grad_norm": 1.1794932763368144,
"learning_rate": 0.0009726318488152872,
"loss": 3.9238,
"step": 1163
},
{
"epoch": 0.13,
"grad_norm": 1.395301234022215,
"learning_rate": 0.0009725712224727523,
"loss": 3.9941,
"step": 1164
},
{
"epoch": 0.13,
"grad_norm": 1.1358170395882634,
"learning_rate": 0.0009725105309480253,
"loss": 3.9771,
"step": 1165
},
{
"epoch": 0.13,
"grad_norm": 1.2158345909415074,
"learning_rate": 0.0009724497742494776,
"loss": 3.9244,
"step": 1166
},
{
"epoch": 0.13,
"grad_norm": 1.2711099423714294,
"learning_rate": 0.000972388952385489,
"loss": 4.0551,
"step": 1167
},
{
"epoch": 0.13,
"grad_norm": 1.1051383784413746,
"learning_rate": 0.000972328065364449,
"loss": 3.8987,
"step": 1168
},
{
"epoch": 0.13,
"grad_norm": 1.1468378846132352,
"learning_rate": 0.0009722671131947559,
"loss": 3.819,
"step": 1169
},
{
"epoch": 0.13,
"grad_norm": 1.0203706772290229,
"learning_rate": 0.0009722060958848168,
"loss": 3.7945,
"step": 1170
},
{
"epoch": 0.13,
"grad_norm": 1.4775824263762234,
"learning_rate": 0.0009721450134430478,
"loss": 3.883,
"step": 1171
},
{
"epoch": 0.13,
"grad_norm": 1.3536524118876307,
"learning_rate": 0.000972083865877874,
"loss": 4.1824,
"step": 1172
},
{
"epoch": 0.13,
"grad_norm": 3.101880117571411,
"learning_rate": 0.0009720226531977296,
"loss": 4.1951,
"step": 1173
},
{
"epoch": 0.13,
"grad_norm": 1.3680630555380664,
"learning_rate": 0.0009719613754110578,
"loss": 4.136,
"step": 1174
},
{
"epoch": 0.13,
"grad_norm": 1.1442504024431006,
"learning_rate": 0.0009719000325263109,
"loss": 3.9422,
"step": 1175
},
{
"epoch": 0.13,
"grad_norm": 7.083179856231108,
"learning_rate": 0.0009718386245519495,
"loss": 4.1419,
"step": 1176
},
{
"epoch": 0.13,
"grad_norm": 1.1467413861101263,
"learning_rate": 0.0009717771514964439,
"loss": 3.9311,
"step": 1177
},
{
"epoch": 0.14,
"grad_norm": 4.848618204270367,
"learning_rate": 0.0009717156133682734,
"loss": 3.9715,
"step": 1178
},
{
"epoch": 0.14,
"grad_norm": 2.3302178796004167,
"learning_rate": 0.0009716540101759255,
"loss": 4.1595,
"step": 1179
},
{
"epoch": 0.14,
"grad_norm": 3.108320054341137,
"learning_rate": 0.0009715923419278976,
"loss": 4.1926,
"step": 1180
}
],
"logging_steps": 1.0,
"max_steps": 8721,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"total_flos": 3861465440256.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}