{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.976,
"eval_steps": 125,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002,
"grad_norm": 0.06923668831586838,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4175,
"step": 1
},
{
"epoch": 0.002,
"eval_loss": 0.4618559181690216,
"eval_runtime": 137.9356,
"eval_samples_per_second": 4.002,
"eval_steps_per_second": 0.5,
"step": 1
},
{
"epoch": 0.004,
"grad_norm": 0.09036832302808762,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.5159,
"step": 2
},
{
"epoch": 0.006,
"grad_norm": 0.06212183088064194,
"learning_rate": 3e-06,
"loss": 0.3274,
"step": 3
},
{
"epoch": 0.008,
"grad_norm": 0.089068204164505,
"learning_rate": 4.000000000000001e-06,
"loss": 0.5353,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 0.08060520887374878,
"learning_rate": 5e-06,
"loss": 0.5229,
"step": 5
},
{
"epoch": 0.012,
"grad_norm": 0.08129512518644333,
"learning_rate": 6e-06,
"loss": 0.416,
"step": 6
},
{
"epoch": 0.014,
"grad_norm": 0.13881395757198334,
"learning_rate": 7e-06,
"loss": 0.4797,
"step": 7
},
{
"epoch": 0.016,
"grad_norm": 0.09156442433595657,
"learning_rate": 8.000000000000001e-06,
"loss": 0.4808,
"step": 8
},
{
"epoch": 0.018,
"grad_norm": 0.09145132452249527,
"learning_rate": 9e-06,
"loss": 0.4991,
"step": 9
},
{
"epoch": 0.02,
"grad_norm": 0.08622220903635025,
"learning_rate": 1e-05,
"loss": 0.484,
"step": 10
},
{
"epoch": 0.022,
"grad_norm": 0.07630373537540436,
"learning_rate": 9.999974825027756e-06,
"loss": 0.3951,
"step": 11
},
{
"epoch": 0.024,
"grad_norm": 0.06840338557958603,
"learning_rate": 9.999899300364534e-06,
"loss": 0.4058,
"step": 12
},
{
"epoch": 0.026,
"grad_norm": 0.09991295635700226,
"learning_rate": 9.999773426770864e-06,
"loss": 0.5737,
"step": 13
},
{
"epoch": 0.028,
"grad_norm": 0.09987013041973114,
"learning_rate": 9.999597205514298e-06,
"loss": 0.4535,
"step": 14
},
{
"epoch": 0.03,
"grad_norm": 0.07334341108798981,
"learning_rate": 9.999370638369377e-06,
"loss": 0.4047,
"step": 15
},
{
"epoch": 0.032,
"grad_norm": 0.10504010319709778,
"learning_rate": 9.99909372761763e-06,
"loss": 0.4587,
"step": 16
},
{
"epoch": 0.034,
"grad_norm": 0.12481511384248734,
"learning_rate": 9.998766476047546e-06,
"loss": 0.5568,
"step": 17
},
{
"epoch": 0.036,
"grad_norm": 0.10193619877099991,
"learning_rate": 9.998388886954546e-06,
"loss": 0.58,
"step": 18
},
{
"epoch": 0.038,
"grad_norm": 0.09747433662414551,
"learning_rate": 9.997960964140946e-06,
"loss": 0.4248,
"step": 19
},
{
"epoch": 0.04,
"grad_norm": 0.10985693335533142,
"learning_rate": 9.997482711915926e-06,
"loss": 0.5813,
"step": 20
},
{
"epoch": 0.042,
"grad_norm": 0.08061390370130539,
"learning_rate": 9.99695413509548e-06,
"loss": 0.3419,
"step": 21
},
{
"epoch": 0.044,
"grad_norm": 0.09820478409528732,
"learning_rate": 9.99637523900237e-06,
"loss": 0.336,
"step": 22
},
{
"epoch": 0.046,
"grad_norm": 0.11657540500164032,
"learning_rate": 9.995746029466071e-06,
"loss": 0.4634,
"step": 23
},
{
"epoch": 0.048,
"grad_norm": 0.0904548391699791,
"learning_rate": 9.99506651282272e-06,
"loss": 0.4085,
"step": 24
},
{
"epoch": 0.05,
"grad_norm": 0.1137523204088211,
"learning_rate": 9.994336695915041e-06,
"loss": 0.6002,
"step": 25
},
{
"epoch": 0.052,
"grad_norm": 0.08930382132530212,
"learning_rate": 9.993556586092281e-06,
"loss": 0.4007,
"step": 26
},
{
"epoch": 0.054,
"grad_norm": 0.10268951207399368,
"learning_rate": 9.992726191210139e-06,
"loss": 0.5762,
"step": 27
},
{
"epoch": 0.056,
"grad_norm": 0.11000809073448181,
"learning_rate": 9.991845519630679e-06,
"loss": 0.5878,
"step": 28
},
{
"epoch": 0.058,
"grad_norm": 0.08394967019557953,
"learning_rate": 9.990914580222258e-06,
"loss": 0.4447,
"step": 29
},
{
"epoch": 0.06,
"grad_norm": 0.10849784314632416,
"learning_rate": 9.989933382359423e-06,
"loss": 0.6129,
"step": 30
},
{
"epoch": 0.062,
"grad_norm": 0.09749893844127655,
"learning_rate": 9.988901935922826e-06,
"loss": 0.4993,
"step": 31
},
{
"epoch": 0.064,
"grad_norm": 0.09867393970489502,
"learning_rate": 9.987820251299121e-06,
"loss": 0.4415,
"step": 32
},
{
"epoch": 0.066,
"grad_norm": 0.07566885650157928,
"learning_rate": 9.986688339380863e-06,
"loss": 0.3669,
"step": 33
},
{
"epoch": 0.068,
"grad_norm": 0.08246949315071106,
"learning_rate": 9.985506211566388e-06,
"loss": 0.4102,
"step": 34
},
{
"epoch": 0.07,
"grad_norm": 0.10148797929286957,
"learning_rate": 9.984273879759713e-06,
"loss": 0.5327,
"step": 35
},
{
"epoch": 0.072,
"grad_norm": 0.08779735118150711,
"learning_rate": 9.982991356370404e-06,
"loss": 0.4914,
"step": 36
},
{
"epoch": 0.074,
"grad_norm": 0.09165964275598526,
"learning_rate": 9.981658654313458e-06,
"loss": 0.4136,
"step": 37
},
{
"epoch": 0.076,
"grad_norm": 0.10425784438848495,
"learning_rate": 9.98027578700917e-06,
"loss": 0.6063,
"step": 38
},
{
"epoch": 0.078,
"grad_norm": 0.09124460816383362,
"learning_rate": 9.978842768382999e-06,
"loss": 0.5461,
"step": 39
},
{
"epoch": 0.08,
"grad_norm": 0.0863451436161995,
"learning_rate": 9.977359612865424e-06,
"loss": 0.5108,
"step": 40
},
{
"epoch": 0.082,
"grad_norm": 0.11560487747192383,
"learning_rate": 9.975826335391808e-06,
"loss": 0.4965,
"step": 41
},
{
"epoch": 0.084,
"grad_norm": 0.1319773942232132,
"learning_rate": 9.974242951402236e-06,
"loss": 0.4754,
"step": 42
},
{
"epoch": 0.086,
"grad_norm": 0.08868485689163208,
"learning_rate": 9.972609476841368e-06,
"loss": 0.4958,
"step": 43
},
{
"epoch": 0.088,
"grad_norm": 0.12390384823083878,
"learning_rate": 9.970925928158275e-06,
"loss": 0.5641,
"step": 44
},
{
"epoch": 0.09,
"grad_norm": 0.095445416867733,
"learning_rate": 9.969192322306271e-06,
"loss": 0.5145,
"step": 45
},
{
"epoch": 0.092,
"grad_norm": 0.09656377136707306,
"learning_rate": 9.96740867674275e-06,
"loss": 0.3749,
"step": 46
},
{
"epoch": 0.094,
"grad_norm": 0.07841179519891739,
"learning_rate": 9.965575009429006e-06,
"loss": 0.4113,
"step": 47
},
{
"epoch": 0.096,
"grad_norm": 0.07786890119314194,
"learning_rate": 9.963691338830045e-06,
"loss": 0.4374,
"step": 48
},
{
"epoch": 0.098,
"grad_norm": 0.09050661325454712,
"learning_rate": 9.961757683914406e-06,
"loss": 0.5285,
"step": 49
},
{
"epoch": 0.1,
"grad_norm": 0.11070208251476288,
"learning_rate": 9.959774064153977e-06,
"loss": 0.5326,
"step": 50
},
{
"epoch": 0.102,
"grad_norm": 0.09067952632904053,
"learning_rate": 9.957740499523787e-06,
"loss": 0.5613,
"step": 51
},
{
"epoch": 0.104,
"grad_norm": 0.08883544057607651,
"learning_rate": 9.955657010501807e-06,
"loss": 0.4599,
"step": 52
},
{
"epoch": 0.106,
"grad_norm": 0.10251513868570328,
"learning_rate": 9.95352361806875e-06,
"loss": 0.5354,
"step": 53
},
{
"epoch": 0.108,
"grad_norm": 0.07133735716342926,
"learning_rate": 9.951340343707852e-06,
"loss": 0.3696,
"step": 54
},
{
"epoch": 0.11,
"grad_norm": 0.061642151325941086,
"learning_rate": 9.949107209404664e-06,
"loss": 0.3472,
"step": 55
},
{
"epoch": 0.112,
"grad_norm": 0.08950634300708771,
"learning_rate": 9.946824237646823e-06,
"loss": 0.4969,
"step": 56
},
{
"epoch": 0.114,
"grad_norm": 0.08016358315944672,
"learning_rate": 9.944491451423829e-06,
"loss": 0.5239,
"step": 57
},
{
"epoch": 0.116,
"grad_norm": 0.12512832880020142,
"learning_rate": 9.942108874226812e-06,
"loss": 0.5365,
"step": 58
},
{
"epoch": 0.118,
"grad_norm": 0.09220532327890396,
"learning_rate": 9.9396765300483e-06,
"loss": 0.4783,
"step": 59
},
{
"epoch": 0.12,
"grad_norm": 0.0885612890124321,
"learning_rate": 9.937194443381972e-06,
"loss": 0.5459,
"step": 60
},
{
"epoch": 0.122,
"grad_norm": 0.08592379838228226,
"learning_rate": 9.934662639222412e-06,
"loss": 0.4545,
"step": 61
},
{
"epoch": 0.124,
"grad_norm": 0.08418423682451248,
"learning_rate": 9.93208114306486e-06,
"loss": 0.5105,
"step": 62
},
{
"epoch": 0.126,
"grad_norm": 0.07870952039957047,
"learning_rate": 9.929449980904952e-06,
"loss": 0.4593,
"step": 63
},
{
"epoch": 0.128,
"grad_norm": 0.08841884881258011,
"learning_rate": 9.926769179238467e-06,
"loss": 0.4812,
"step": 64
},
{
"epoch": 0.13,
"grad_norm": 0.07493194192647934,
"learning_rate": 9.924038765061042e-06,
"loss": 0.5065,
"step": 65
},
{
"epoch": 0.132,
"grad_norm": 0.08470446616411209,
"learning_rate": 9.921258765867919e-06,
"loss": 0.4676,
"step": 66
},
{
"epoch": 0.134,
"grad_norm": 0.0656595379114151,
"learning_rate": 9.918429209653662e-06,
"loss": 0.3227,
"step": 67
},
{
"epoch": 0.136,
"grad_norm": 0.06501025706529617,
"learning_rate": 9.915550124911866e-06,
"loss": 0.2777,
"step": 68
},
{
"epoch": 0.138,
"grad_norm": 0.08443128317594528,
"learning_rate": 9.912621540634889e-06,
"loss": 0.4357,
"step": 69
},
{
"epoch": 0.14,
"grad_norm": 0.07121642678976059,
"learning_rate": 9.909643486313533e-06,
"loss": 0.3545,
"step": 70
},
{
"epoch": 0.142,
"grad_norm": 0.09408602863550186,
"learning_rate": 9.906615991936781e-06,
"loss": 0.3916,
"step": 71
},
{
"epoch": 0.144,
"grad_norm": 0.05998094752430916,
"learning_rate": 9.903539087991462e-06,
"loss": 0.2739,
"step": 72
},
{
"epoch": 0.146,
"grad_norm": 0.08949826657772064,
"learning_rate": 9.900412805461968e-06,
"loss": 0.3722,
"step": 73
},
{
"epoch": 0.148,
"grad_norm": 0.0731697678565979,
"learning_rate": 9.897237175829927e-06,
"loss": 0.2906,
"step": 74
},
{
"epoch": 0.15,
"grad_norm": 0.07855986058712006,
"learning_rate": 9.894012231073895e-06,
"loss": 0.4149,
"step": 75
},
{
"epoch": 0.152,
"grad_norm": 0.0791892409324646,
"learning_rate": 9.890738003669029e-06,
"loss": 0.4383,
"step": 76
},
{
"epoch": 0.154,
"grad_norm": 0.07980603724718094,
"learning_rate": 9.887414526586764e-06,
"loss": 0.4867,
"step": 77
},
{
"epoch": 0.156,
"grad_norm": 0.08503536880016327,
"learning_rate": 9.884041833294477e-06,
"loss": 0.4644,
"step": 78
},
{
"epoch": 0.158,
"grad_norm": 0.09240555018186569,
"learning_rate": 9.880619957755151e-06,
"loss": 0.3107,
"step": 79
},
{
"epoch": 0.16,
"grad_norm": 0.08195238560438156,
"learning_rate": 9.877148934427037e-06,
"loss": 0.3414,
"step": 80
},
{
"epoch": 0.162,
"grad_norm": 0.09512759745121002,
"learning_rate": 9.873628798263297e-06,
"loss": 0.4745,
"step": 81
},
{
"epoch": 0.164,
"grad_norm": 0.07976000756025314,
"learning_rate": 9.870059584711668e-06,
"loss": 0.3925,
"step": 82
},
{
"epoch": 0.166,
"grad_norm": 0.11229317635297775,
"learning_rate": 9.86644132971409e-06,
"loss": 0.4921,
"step": 83
},
{
"epoch": 0.168,
"grad_norm": 0.07479218393564224,
"learning_rate": 9.862774069706346e-06,
"loss": 0.3607,
"step": 84
},
{
"epoch": 0.17,
"grad_norm": 0.08530927449464798,
"learning_rate": 9.859057841617709e-06,
"loss": 0.4116,
"step": 85
},
{
"epoch": 0.172,
"grad_norm": 0.05544688552618027,
"learning_rate": 9.855292682870552e-06,
"loss": 0.2043,
"step": 86
},
{
"epoch": 0.174,
"grad_norm": 0.08539939671754837,
"learning_rate": 9.851478631379982e-06,
"loss": 0.4437,
"step": 87
},
{
"epoch": 0.176,
"grad_norm": 0.08732863515615463,
"learning_rate": 9.847615725553457e-06,
"loss": 0.4449,
"step": 88
},
{
"epoch": 0.178,
"grad_norm": 0.08848625421524048,
"learning_rate": 9.843704004290393e-06,
"loss": 0.5191,
"step": 89
},
{
"epoch": 0.18,
"grad_norm": 0.1142885684967041,
"learning_rate": 9.839743506981783e-06,
"loss": 0.3788,
"step": 90
},
{
"epoch": 0.182,
"grad_norm": 0.0678037703037262,
"learning_rate": 9.835734273509787e-06,
"loss": 0.3655,
"step": 91
},
{
"epoch": 0.184,
"grad_norm": 0.08179458975791931,
"learning_rate": 9.831676344247343e-06,
"loss": 0.4804,
"step": 92
},
{
"epoch": 0.186,
"grad_norm": 0.10821828246116638,
"learning_rate": 9.827569760057755e-06,
"loss": 0.4946,
"step": 93
},
{
"epoch": 0.188,
"grad_norm": 0.06980521976947784,
"learning_rate": 9.82341456229428e-06,
"loss": 0.3301,
"step": 94
},
{
"epoch": 0.19,
"grad_norm": 0.07966768741607666,
"learning_rate": 9.819210792799711e-06,
"loss": 0.4377,
"step": 95
},
{
"epoch": 0.192,
"grad_norm": 0.08750802278518677,
"learning_rate": 9.814958493905962e-06,
"loss": 0.4137,
"step": 96
},
{
"epoch": 0.194,
"grad_norm": 0.08171187341213226,
"learning_rate": 9.810657708433637e-06,
"loss": 0.5154,
"step": 97
},
{
"epoch": 0.196,
"grad_norm": 0.07627864181995392,
"learning_rate": 9.806308479691595e-06,
"loss": 0.3593,
"step": 98
},
{
"epoch": 0.198,
"grad_norm": 0.07038850337266922,
"learning_rate": 9.801910851476524e-06,
"loss": 0.3882,
"step": 99
},
{
"epoch": 0.2,
"grad_norm": 0.09910848736763,
"learning_rate": 9.797464868072489e-06,
"loss": 0.5034,
"step": 100
},
{
"epoch": 0.202,
"grad_norm": 0.08382704854011536,
"learning_rate": 9.792970574250493e-06,
"loss": 0.4769,
"step": 101
},
{
"epoch": 0.204,
"grad_norm": 0.07511335611343384,
"learning_rate": 9.788428015268027e-06,
"loss": 0.3703,
"step": 102
},
{
"epoch": 0.206,
"grad_norm": 0.08155877888202667,
"learning_rate": 9.78383723686861e-06,
"loss": 0.4102,
"step": 103
},
{
"epoch": 0.208,
"grad_norm": 0.06436574459075928,
"learning_rate": 9.779198285281326e-06,
"loss": 0.3253,
"step": 104
},
{
"epoch": 0.21,
"grad_norm": 0.06901544332504272,
"learning_rate": 9.774511207220369e-06,
"loss": 0.2842,
"step": 105
},
{
"epoch": 0.212,
"grad_norm": 0.08444689959287643,
"learning_rate": 9.769776049884564e-06,
"loss": 0.4212,
"step": 106
},
{
"epoch": 0.214,
"grad_norm": 0.08550014346837997,
"learning_rate": 9.76499286095689e-06,
"loss": 0.4404,
"step": 107
},
{
"epoch": 0.216,
"grad_norm": 0.09659305214881897,
"learning_rate": 9.760161688604008e-06,
"loss": 0.5841,
"step": 108
},
{
"epoch": 0.218,
"grad_norm": 0.06201549619436264,
"learning_rate": 9.755282581475769e-06,
"loss": 0.2246,
"step": 109
},
{
"epoch": 0.22,
"grad_norm": 0.07813581079244614,
"learning_rate": 9.750355588704728e-06,
"loss": 0.4415,
"step": 110
},
{
"epoch": 0.222,
"grad_norm": 0.10021974891424179,
"learning_rate": 9.745380759905648e-06,
"loss": 0.3042,
"step": 111
},
{
"epoch": 0.224,
"grad_norm": 0.10321412235498428,
"learning_rate": 9.740358145174999e-06,
"loss": 0.4837,
"step": 112
},
{
"epoch": 0.226,
"grad_norm": 0.11536537110805511,
"learning_rate": 9.735287795090455e-06,
"loss": 0.5586,
"step": 113
},
{
"epoch": 0.228,
"grad_norm": 0.07521039247512817,
"learning_rate": 9.730169760710385e-06,
"loss": 0.361,
"step": 114
},
{
"epoch": 0.23,
"grad_norm": 0.07128458470106125,
"learning_rate": 9.725004093573343e-06,
"loss": 0.3511,
"step": 115
},
{
"epoch": 0.232,
"grad_norm": 0.08504608273506165,
"learning_rate": 9.719790845697534e-06,
"loss": 0.4472,
"step": 116
},
{
"epoch": 0.234,
"grad_norm": 0.08541107177734375,
"learning_rate": 9.71453006958031e-06,
"loss": 0.3195,
"step": 117
},
{
"epoch": 0.236,
"grad_norm": 0.085638627409935,
"learning_rate": 9.709221818197626e-06,
"loss": 0.4343,
"step": 118
},
{
"epoch": 0.238,
"grad_norm": 0.06405656784772873,
"learning_rate": 9.703866145003512e-06,
"loss": 0.2905,
"step": 119
},
{
"epoch": 0.24,
"grad_norm": 0.12191811949014664,
"learning_rate": 9.698463103929542e-06,
"loss": 0.4092,
"step": 120
},
{
"epoch": 0.242,
"grad_norm": 0.08051154762506485,
"learning_rate": 9.69301274938428e-06,
"loss": 0.3362,
"step": 121
},
{
"epoch": 0.244,
"grad_norm": 0.09473302215337753,
"learning_rate": 9.687515136252732e-06,
"loss": 0.3941,
"step": 122
},
{
"epoch": 0.246,
"grad_norm": 0.09992998838424683,
"learning_rate": 9.681970319895804e-06,
"loss": 0.4603,
"step": 123
},
{
"epoch": 0.248,
"grad_norm": 0.08887780457735062,
"learning_rate": 9.676378356149733e-06,
"loss": 0.3082,
"step": 124
},
{
"epoch": 0.25,
"grad_norm": 0.08823645859956741,
"learning_rate": 9.670739301325534e-06,
"loss": 0.4301,
"step": 125
},
{
"epoch": 0.25,
"eval_loss": 0.3706146478652954,
"eval_runtime": 76.5201,
"eval_samples_per_second": 7.214,
"eval_steps_per_second": 0.902,
"step": 125
},
{
"epoch": 0.252,
"grad_norm": 0.10688935965299606,
"learning_rate": 9.665053212208426e-06,
"loss": 0.3065,
"step": 126
},
{
"epoch": 0.254,
"grad_norm": 0.09517981857061386,
"learning_rate": 9.659320146057263e-06,
"loss": 0.5437,
"step": 127
},
{
"epoch": 0.256,
"grad_norm": 0.11310486495494843,
"learning_rate": 9.653540160603956e-06,
"loss": 0.6087,
"step": 128
},
{
"epoch": 0.258,
"grad_norm": 0.08851969987154007,
"learning_rate": 9.647713314052896e-06,
"loss": 0.3598,
"step": 129
},
{
"epoch": 0.26,
"grad_norm": 0.09503145515918732,
"learning_rate": 9.641839665080363e-06,
"loss": 0.338,
"step": 130
},
{
"epoch": 0.262,
"grad_norm": 0.09553948044776917,
"learning_rate": 9.635919272833938e-06,
"loss": 0.3801,
"step": 131
},
{
"epoch": 0.264,
"grad_norm": 0.09811339527368546,
"learning_rate": 9.629952196931902e-06,
"loss": 0.3866,
"step": 132
},
{
"epoch": 0.266,
"grad_norm": 0.0865439921617508,
"learning_rate": 9.623938497462647e-06,
"loss": 0.4466,
"step": 133
},
{
"epoch": 0.268,
"grad_norm": 0.09298735857009888,
"learning_rate": 9.617878234984056e-06,
"loss": 0.4413,
"step": 134
},
{
"epoch": 0.27,
"grad_norm": 0.10931612551212311,
"learning_rate": 9.611771470522908e-06,
"loss": 0.3974,
"step": 135
},
{
"epoch": 0.272,
"grad_norm": 0.08798681199550629,
"learning_rate": 9.60561826557425e-06,
"loss": 0.4052,
"step": 136
},
{
"epoch": 0.274,
"grad_norm": 0.09892652928829193,
"learning_rate": 9.599418682100793e-06,
"loss": 0.4645,
"step": 137
},
{
"epoch": 0.276,
"grad_norm": 0.10193604230880737,
"learning_rate": 9.59317278253227e-06,
"loss": 0.4064,
"step": 138
},
{
"epoch": 0.278,
"grad_norm": 0.07900392264127731,
"learning_rate": 9.586880629764817e-06,
"loss": 0.3229,
"step": 139
},
{
"epoch": 0.28,
"grad_norm": 0.08284664154052734,
"learning_rate": 9.580542287160348e-06,
"loss": 0.3703,
"step": 140
},
{
"epoch": 0.282,
"grad_norm": 0.08164459466934204,
"learning_rate": 9.574157818545902e-06,
"loss": 0.2879,
"step": 141
},
{
"epoch": 0.284,
"grad_norm": 0.1115422248840332,
"learning_rate": 9.567727288213005e-06,
"loss": 0.4593,
"step": 142
},
{
"epoch": 0.286,
"grad_norm": 0.09770838916301727,
"learning_rate": 9.561250760917026e-06,
"loss": 0.4133,
"step": 143
},
{
"epoch": 0.288,
"grad_norm": 0.12189961224794388,
"learning_rate": 9.554728301876525e-06,
"loss": 0.5928,
"step": 144
},
{
"epoch": 0.29,
"grad_norm": 0.14093732833862305,
"learning_rate": 9.548159976772593e-06,
"loss": 0.415,
"step": 145
},
{
"epoch": 0.292,
"grad_norm": 0.11479732394218445,
"learning_rate": 9.541545851748186e-06,
"loss": 0.3691,
"step": 146
},
{
"epoch": 0.294,
"grad_norm": 0.09249378740787506,
"learning_rate": 9.534885993407474e-06,
"loss": 0.3394,
"step": 147
},
{
"epoch": 0.296,
"grad_norm": 0.10194878280162811,
"learning_rate": 9.528180468815155e-06,
"loss": 0.3745,
"step": 148
},
{
"epoch": 0.298,
"grad_norm": 0.09345925599336624,
"learning_rate": 9.521429345495787e-06,
"loss": 0.3934,
"step": 149
},
{
"epoch": 0.3,
"grad_norm": 0.09919178485870361,
"learning_rate": 9.514632691433108e-06,
"loss": 0.4053,
"step": 150
},
{
"epoch": 0.302,
"grad_norm": 0.10807909071445465,
"learning_rate": 9.507790575069347e-06,
"loss": 0.4631,
"step": 151
},
{
"epoch": 0.304,
"grad_norm": 0.10555636882781982,
"learning_rate": 9.50090306530454e-06,
"loss": 0.4952,
"step": 152
},
{
"epoch": 0.306,
"grad_norm": 0.10507559776306152,
"learning_rate": 9.493970231495836e-06,
"loss": 0.294,
"step": 153
},
{
"epoch": 0.308,
"grad_norm": 0.08718883246183395,
"learning_rate": 9.486992143456792e-06,
"loss": 0.3044,
"step": 154
},
{
"epoch": 0.31,
"grad_norm": 0.10039477050304413,
"learning_rate": 9.47996887145668e-06,
"loss": 0.3736,
"step": 155
},
{
"epoch": 0.312,
"grad_norm": 0.09952064603567123,
"learning_rate": 9.47290048621977e-06,
"loss": 0.4359,
"step": 156
},
{
"epoch": 0.314,
"grad_norm": 0.10663799196481705,
"learning_rate": 9.46578705892462e-06,
"loss": 0.3939,
"step": 157
},
{
"epoch": 0.316,
"grad_norm": 0.10759017616510391,
"learning_rate": 9.458628661203368e-06,
"loss": 0.4575,
"step": 158
},
{
"epoch": 0.318,
"grad_norm": 0.08924371749162674,
"learning_rate": 9.451425365140997e-06,
"loss": 0.3525,
"step": 159
},
{
"epoch": 0.32,
"grad_norm": 0.13670168817043304,
"learning_rate": 9.444177243274619e-06,
"loss": 0.5385,
"step": 160
},
{
"epoch": 0.322,
"grad_norm": 0.10520858317613602,
"learning_rate": 9.43688436859274e-06,
"loss": 0.2964,
"step": 161
},
{
"epoch": 0.324,
"grad_norm": 0.10608810931444168,
"learning_rate": 9.429546814534528e-06,
"loss": 0.4369,
"step": 162
},
{
"epoch": 0.326,
"grad_norm": 0.08399061113595963,
"learning_rate": 9.422164654989073e-06,
"loss": 0.3246,
"step": 163
},
{
"epoch": 0.328,
"grad_norm": 0.11295214295387268,
"learning_rate": 9.414737964294636e-06,
"loss": 0.4766,
"step": 164
},
{
"epoch": 0.33,
"grad_norm": 0.1255977749824524,
"learning_rate": 9.40726681723791e-06,
"loss": 0.5263,
"step": 165
},
{
"epoch": 0.332,
"grad_norm": 0.0891086682677269,
"learning_rate": 9.399751289053267e-06,
"loss": 0.2796,
"step": 166
},
{
"epoch": 0.334,
"grad_norm": 0.12856395542621613,
"learning_rate": 9.392191455421989e-06,
"loss": 0.4485,
"step": 167
},
{
"epoch": 0.336,
"grad_norm": 0.1172974556684494,
"learning_rate": 9.384587392471516e-06,
"loss": 0.542,
"step": 168
},
{
"epoch": 0.338,
"grad_norm": 0.08675208687782288,
"learning_rate": 9.376939176774678e-06,
"loss": 0.2899,
"step": 169
},
{
"epoch": 0.34,
"grad_norm": 0.11079028248786926,
"learning_rate": 9.369246885348926e-06,
"loss": 0.3732,
"step": 170
},
{
"epoch": 0.342,
"grad_norm": 0.12667471170425415,
"learning_rate": 9.361510595655545e-06,
"loss": 0.54,
"step": 171
},
{
"epoch": 0.344,
"grad_norm": 0.08692082017660141,
"learning_rate": 9.353730385598887e-06,
"loss": 0.3873,
"step": 172
},
{
"epoch": 0.346,
"grad_norm": 0.1013069748878479,
"learning_rate": 9.345906333525582e-06,
"loss": 0.438,
"step": 173
},
{
"epoch": 0.348,
"grad_norm": 0.09999188780784607,
"learning_rate": 9.338038518223746e-06,
"loss": 0.4467,
"step": 174
},
{
"epoch": 0.35,
"grad_norm": 0.11317498981952667,
"learning_rate": 9.330127018922195e-06,
"loss": 0.3912,
"step": 175
},
{
"epoch": 0.352,
"grad_norm": 0.10574603080749512,
"learning_rate": 9.322171915289635e-06,
"loss": 0.3808,
"step": 176
},
{
"epoch": 0.354,
"grad_norm": 0.1281527876853943,
"learning_rate": 9.314173287433874e-06,
"loss": 0.423,
"step": 177
},
{
"epoch": 0.356,
"grad_norm": 0.12899580597877502,
"learning_rate": 9.306131215901004e-06,
"loss": 0.4509,
"step": 178
},
{
"epoch": 0.358,
"grad_norm": 0.10952267050743103,
"learning_rate": 9.298045781674595e-06,
"loss": 0.3512,
"step": 179
},
{
"epoch": 0.36,
"grad_norm": 0.1423255354166031,
"learning_rate": 9.289917066174887e-06,
"loss": 0.3631,
"step": 180
},
{
"epoch": 0.362,
"grad_norm": 0.13039131462574005,
"learning_rate": 9.281745151257946e-06,
"loss": 0.3762,
"step": 181
},
{
"epoch": 0.364,
"grad_norm": 0.10448655486106873,
"learning_rate": 9.273530119214868e-06,
"loss": 0.3694,
"step": 182
},
{
"epoch": 0.366,
"grad_norm": 0.0945306122303009,
"learning_rate": 9.265272052770936e-06,
"loss": 0.28,
"step": 183
},
{
"epoch": 0.368,
"grad_norm": 0.10995735973119736,
"learning_rate": 9.256971035084786e-06,
"loss": 0.4849,
"step": 184
},
{
"epoch": 0.37,
"grad_norm": 0.11014600843191147,
"learning_rate": 9.248627149747573e-06,
"loss": 0.3213,
"step": 185
},
{
"epoch": 0.372,
"grad_norm": 0.09283925592899323,
"learning_rate": 9.24024048078213e-06,
"loss": 0.4077,
"step": 186
},
{
"epoch": 0.374,
"grad_norm": 0.14395715296268463,
"learning_rate": 9.231811112642121e-06,
"loss": 0.4869,
"step": 187
},
{
"epoch": 0.376,
"grad_norm": 0.10785488784313202,
"learning_rate": 9.223339130211194e-06,
"loss": 0.4122,
"step": 188
},
{
"epoch": 0.378,
"grad_norm": 0.09983161091804504,
"learning_rate": 9.214824618802108e-06,
"loss": 0.3027,
"step": 189
},
{
"epoch": 0.38,
"grad_norm": 0.10121427476406097,
"learning_rate": 9.206267664155906e-06,
"loss": 0.3055,
"step": 190
},
{
"epoch": 0.382,
"grad_norm": 0.11393419653177261,
"learning_rate": 9.197668352441025e-06,
"loss": 0.3567,
"step": 191
},
{
"epoch": 0.384,
"grad_norm": 0.132842019200325,
"learning_rate": 9.189026770252437e-06,
"loss": 0.3556,
"step": 192
},
{
"epoch": 0.386,
"grad_norm": 0.1139449030160904,
"learning_rate": 9.18034300461078e-06,
"loss": 0.4298,
"step": 193
},
{
"epoch": 0.388,
"grad_norm": 0.09980877488851547,
"learning_rate": 9.171617142961477e-06,
"loss": 0.3853,
"step": 194
},
{
"epoch": 0.39,
"grad_norm": 0.12531818449497223,
"learning_rate": 9.162849273173857e-06,
"loss": 0.4845,
"step": 195
},
{
"epoch": 0.392,
"grad_norm": 0.11148197203874588,
"learning_rate": 9.154039483540273e-06,
"loss": 0.4091,
"step": 196
},
{
"epoch": 0.394,
"grad_norm": 0.11962081491947174,
"learning_rate": 9.145187862775208e-06,
"loss": 0.371,
"step": 197
},
{
"epoch": 0.396,
"grad_norm": 0.10789982974529266,
"learning_rate": 9.136294500014387e-06,
"loss": 0.4268,
"step": 198
},
{
"epoch": 0.398,
"grad_norm": 0.15846121311187744,
"learning_rate": 9.12735948481387e-06,
"loss": 0.6264,
"step": 199
},
{
"epoch": 0.4,
"grad_norm": 0.1426246613264084,
"learning_rate": 9.118382907149164e-06,
"loss": 0.4769,
"step": 200
},
{
"epoch": 0.402,
"grad_norm": 0.1069459393620491,
"learning_rate": 9.109364857414306e-06,
"loss": 0.3708,
"step": 201
},
{
"epoch": 0.404,
"grad_norm": 0.10732389986515045,
"learning_rate": 9.100305426420957e-06,
"loss": 0.3962,
"step": 202
},
{
"epoch": 0.406,
"grad_norm": 0.1436106562614441,
"learning_rate": 9.091204705397485e-06,
"loss": 0.4549,
"step": 203
},
{
"epoch": 0.408,
"grad_norm": 0.10230587422847748,
"learning_rate": 9.08206278598805e-06,
"loss": 0.3926,
"step": 204
},
{
"epoch": 0.41,
"grad_norm": 0.11367027461528778,
"learning_rate": 9.07287976025168e-06,
"loss": 0.3378,
"step": 205
},
{
"epoch": 0.412,
"grad_norm": 0.14832234382629395,
"learning_rate": 9.06365572066134e-06,
"loss": 0.4202,
"step": 206
},
{
"epoch": 0.414,
"grad_norm": 0.10567332804203033,
"learning_rate": 9.05439076010301e-06,
"loss": 0.2904,
"step": 207
},
{
"epoch": 0.416,
"grad_norm": 0.11918513476848602,
"learning_rate": 9.045084971874738e-06,
"loss": 0.2632,
"step": 208
},
{
"epoch": 0.418,
"grad_norm": 0.13223537802696228,
"learning_rate": 9.035738449685707e-06,
"loss": 0.4208,
"step": 209
},
{
"epoch": 0.42,
"grad_norm": 0.12573251128196716,
"learning_rate": 9.026351287655294e-06,
"loss": 0.4609,
"step": 210
},
{
"epoch": 0.422,
"grad_norm": 0.11943136155605316,
"learning_rate": 9.016923580312114e-06,
"loss": 0.3323,
"step": 211
},
{
"epoch": 0.424,
"grad_norm": 0.13152974843978882,
"learning_rate": 9.007455422593077e-06,
"loss": 0.4258,
"step": 212
},
{
"epoch": 0.426,
"grad_norm": 0.13339808583259583,
"learning_rate": 8.997946909842426e-06,
"loss": 0.5303,
"step": 213
},
{
"epoch": 0.428,
"grad_norm": 0.11746034771203995,
"learning_rate": 8.988398137810778e-06,
"loss": 0.4109,
"step": 214
},
{
"epoch": 0.43,
"grad_norm": 0.11518029868602753,
"learning_rate": 8.978809202654161e-06,
"loss": 0.4154,
"step": 215
},
{
"epoch": 0.432,
"grad_norm": 0.15307952463626862,
"learning_rate": 8.969180200933048e-06,
"loss": 0.4196,
"step": 216
},
{
"epoch": 0.434,
"grad_norm": 0.11385340988636017,
"learning_rate": 8.959511229611377e-06,
"loss": 0.3713,
"step": 217
},
{
"epoch": 0.436,
"grad_norm": 0.1380355805158615,
"learning_rate": 8.949802386055582e-06,
"loss": 0.3891,
"step": 218
},
{
"epoch": 0.438,
"grad_norm": 0.09614066779613495,
"learning_rate": 8.94005376803361e-06,
"loss": 0.2527,
"step": 219
},
{
"epoch": 0.44,
"grad_norm": 0.12352288514375687,
"learning_rate": 8.930265473713939e-06,
"loss": 0.3737,
"step": 220
},
{
"epoch": 0.442,
"grad_norm": 0.18210633099079132,
"learning_rate": 8.92043760166458e-06,
"loss": 0.3839,
"step": 221
},
{
"epoch": 0.444,
"grad_norm": 0.1087498739361763,
"learning_rate": 8.910570250852098e-06,
"loss": 0.3141,
"step": 222
},
{
"epoch": 0.446,
"grad_norm": 0.11985889822244644,
"learning_rate": 8.900663520640605e-06,
"loss": 0.4606,
"step": 223
},
{
"epoch": 0.448,
"grad_norm": 0.146299347281456,
"learning_rate": 8.890717510790763e-06,
"loss": 0.4094,
"step": 224
},
{
"epoch": 0.45,
"grad_norm": 0.09788361191749573,
"learning_rate": 8.880732321458785e-06,
"loss": 0.2964,
"step": 225
},
{
"epoch": 0.452,
"grad_norm": 0.09735774993896484,
"learning_rate": 8.870708053195414e-06,
"loss": 0.2646,
"step": 226
},
{
"epoch": 0.454,
"grad_norm": 0.1293504238128662,
"learning_rate": 8.860644806944917e-06,
"loss": 0.2991,
"step": 227
},
{
"epoch": 0.456,
"grad_norm": 0.13126921653747559,
"learning_rate": 8.850542684044078e-06,
"loss": 0.4474,
"step": 228
},
{
"epoch": 0.458,
"grad_norm": 0.11488878726959229,
"learning_rate": 8.84040178622116e-06,
"loss": 0.3628,
"step": 229
},
{
"epoch": 0.46,
"grad_norm": 0.13861073553562164,
"learning_rate": 8.83022221559489e-06,
"loss": 0.4022,
"step": 230
},
{
"epoch": 0.462,
"grad_norm": 0.16164664924144745,
"learning_rate": 8.820004074673433e-06,
"loss": 0.4217,
"step": 231
},
{
"epoch": 0.464,
"grad_norm": 0.10550030320882797,
"learning_rate": 8.809747466353356e-06,
"loss": 0.2927,
"step": 232
},
{
"epoch": 0.466,
"grad_norm": 0.1035122275352478,
"learning_rate": 8.799452493918586e-06,
"loss": 0.2453,
"step": 233
},
{
"epoch": 0.468,
"grad_norm": 0.15530018508434296,
"learning_rate": 8.789119261039385e-06,
"loss": 0.3758,
"step": 234
},
{
"epoch": 0.47,
"grad_norm": 0.13951483368873596,
"learning_rate": 8.778747871771293e-06,
"loss": 0.4502,
"step": 235
},
{
"epoch": 0.472,
"grad_norm": 0.13241475820541382,
"learning_rate": 8.768338430554083e-06,
"loss": 0.5012,
"step": 236
},
{
"epoch": 0.474,
"grad_norm": 0.11370962113142014,
"learning_rate": 8.757891042210713e-06,
"loss": 0.2801,
"step": 237
},
{
"epoch": 0.476,
"grad_norm": 0.1501305103302002,
"learning_rate": 8.747405811946272e-06,
"loss": 0.4888,
"step": 238
},
{
"epoch": 0.478,
"grad_norm": 0.1636514514684677,
"learning_rate": 8.736882845346906e-06,
"loss": 0.518,
"step": 239
},
{
"epoch": 0.48,
"grad_norm": 0.11505798250436783,
"learning_rate": 8.726322248378775e-06,
"loss": 0.2627,
"step": 240
},
{
"epoch": 0.482,
"grad_norm": 0.15717971324920654,
"learning_rate": 8.715724127386971e-06,
"loss": 0.3299,
"step": 241
},
{
"epoch": 0.484,
"grad_norm": 0.13042742013931274,
"learning_rate": 8.705088589094458e-06,
"loss": 0.351,
"step": 242
},
{
"epoch": 0.486,
"grad_norm": 0.1414385885000229,
"learning_rate": 8.69441574060099e-06,
"loss": 0.471,
"step": 243
},
{
"epoch": 0.488,
"grad_norm": 0.10110446810722351,
"learning_rate": 8.683705689382025e-06,
"loss": 0.2369,
"step": 244
},
{
"epoch": 0.49,
"grad_norm": 0.1549258530139923,
"learning_rate": 8.672958543287666e-06,
"loss": 0.4333,
"step": 245
},
{
"epoch": 0.492,
"grad_norm": 0.11834664642810822,
"learning_rate": 8.662174410541556e-06,
"loss": 0.3182,
"step": 246
},
{
"epoch": 0.494,
"grad_norm": 0.1529727429151535,
"learning_rate": 8.651353399739787e-06,
"loss": 0.4963,
"step": 247
},
{
"epoch": 0.496,
"grad_norm": 0.14854104816913605,
"learning_rate": 8.640495619849821e-06,
"loss": 0.4514,
"step": 248
},
{
"epoch": 0.498,
"grad_norm": 0.12271202355623245,
"learning_rate": 8.629601180209382e-06,
"loss": 0.3694,
"step": 249
},
{
"epoch": 0.5,
"grad_norm": 0.11352905631065369,
"learning_rate": 8.61867019052535e-06,
"loss": 0.2978,
"step": 250
},
{
"epoch": 0.5,
"eval_loss": 0.32808247208595276,
"eval_runtime": 76.51,
"eval_samples_per_second": 7.215,
"eval_steps_per_second": 0.902,
"step": 250
},
{
"epoch": 0.502,
"grad_norm": 0.1511523425579071,
"learning_rate": 8.607702760872679e-06,
"loss": 0.4037,
"step": 251
},
{
"epoch": 0.504,
"grad_norm": 0.13344620168209076,
"learning_rate": 8.596699001693257e-06,
"loss": 0.2303,
"step": 252
},
{
"epoch": 0.506,
"grad_norm": 0.12220989167690277,
"learning_rate": 8.585659023794818e-06,
"loss": 0.4347,
"step": 253
},
{
"epoch": 0.508,
"grad_norm": 0.1094481498003006,
"learning_rate": 8.574582938349818e-06,
"loss": 0.3089,
"step": 254
},
{
"epoch": 0.51,
"grad_norm": 0.11940666288137436,
"learning_rate": 8.563470856894316e-06,
"loss": 0.2699,
"step": 255
},
{
"epoch": 0.512,
"grad_norm": 0.139656201004982,
"learning_rate": 8.552322891326846e-06,
"loss": 0.2763,
"step": 256
},
{
"epoch": 0.514,
"grad_norm": 0.11665194481611252,
"learning_rate": 8.541139153907296e-06,
"loss": 0.2695,
"step": 257
},
{
"epoch": 0.516,
"grad_norm": 0.12714596092700958,
"learning_rate": 8.529919757255783e-06,
"loss": 0.2489,
"step": 258
},
{
"epoch": 0.518,
"grad_norm": 0.12326015532016754,
"learning_rate": 8.518664814351502e-06,
"loss": 0.3067,
"step": 259
},
{
"epoch": 0.52,
"grad_norm": 0.13826797902584076,
"learning_rate": 8.507374438531606e-06,
"loss": 0.3119,
"step": 260
},
{
"epoch": 0.522,
"grad_norm": 0.15031856298446655,
"learning_rate": 8.496048743490053e-06,
"loss": 0.3112,
"step": 261
},
{
"epoch": 0.524,
"grad_norm": 0.14100715517997742,
"learning_rate": 8.48468784327647e-06,
"loss": 0.3878,
"step": 262
},
{
"epoch": 0.526,
"grad_norm": 0.15813864767551422,
"learning_rate": 8.473291852294986e-06,
"loss": 0.3382,
"step": 263
},
{
"epoch": 0.528,
"grad_norm": 0.15911728143692017,
"learning_rate": 8.461860885303116e-06,
"loss": 0.4177,
"step": 264
},
{
"epoch": 0.53,
"grad_norm": 0.15685637295246124,
"learning_rate": 8.450395057410561e-06,
"loss": 0.3557,
"step": 265
},
{
"epoch": 0.532,
"grad_norm": 0.13905856013298035,
"learning_rate": 8.438894484078086e-06,
"loss": 0.3323,
"step": 266
},
{
"epoch": 0.534,
"grad_norm": 0.13344989717006683,
"learning_rate": 8.427359281116335e-06,
"loss": 0.3475,
"step": 267
},
{
"epoch": 0.536,
"grad_norm": 0.16016146540641785,
"learning_rate": 8.415789564684673e-06,
"loss": 0.3789,
"step": 268
},
{
"epoch": 0.538,
"grad_norm": 0.11681054532527924,
"learning_rate": 8.404185451290017e-06,
"loss": 0.2061,
"step": 269
},
{
"epoch": 0.54,
"grad_norm": 0.14662593603134155,
"learning_rate": 8.392547057785662e-06,
"loss": 0.4173,
"step": 270
},
{
"epoch": 0.542,
"grad_norm": 0.21970625221729279,
"learning_rate": 8.380874501370098e-06,
"loss": 0.5602,
"step": 271
},
{
"epoch": 0.544,
"grad_norm": 0.11630596220493317,
"learning_rate": 8.36916789958584e-06,
"loss": 0.2674,
"step": 272
},
{
"epoch": 0.546,
"grad_norm": 0.14212217926979065,
"learning_rate": 8.357427370318239e-06,
"loss": 0.2776,
"step": 273
},
{
"epoch": 0.548,
"grad_norm": 0.14911417663097382,
"learning_rate": 8.345653031794292e-06,
"loss": 0.4463,
"step": 274
},
{
"epoch": 0.55,
"grad_norm": 0.142579585313797,
"learning_rate": 8.33384500258146e-06,
"loss": 0.4963,
"step": 275
},
{
"epoch": 0.552,
"grad_norm": 0.14713557064533234,
"learning_rate": 8.322003401586463e-06,
"loss": 0.2642,
"step": 276
},
{
"epoch": 0.554,
"grad_norm": 0.24756528437137604,
"learning_rate": 8.310128348054093e-06,
"loss": 0.5423,
"step": 277
},
{
"epoch": 0.556,
"grad_norm": 0.13731062412261963,
"learning_rate": 8.298219961566008e-06,
"loss": 0.3333,
"step": 278
},
{
"epoch": 0.558,
"grad_norm": 0.18075144290924072,
"learning_rate": 8.286278362039527e-06,
"loss": 0.3733,
"step": 279
},
{
"epoch": 0.56,
"grad_norm": 0.1650344282388687,
"learning_rate": 8.274303669726427e-06,
"loss": 0.383,
"step": 280
},
{
"epoch": 0.562,
"grad_norm": 0.18053463101387024,
"learning_rate": 8.262296005211722e-06,
"loss": 0.4359,
"step": 281
},
{
"epoch": 0.564,
"grad_norm": 0.16192179918289185,
"learning_rate": 8.250255489412464e-06,
"loss": 0.3839,
"step": 282
},
{
"epoch": 0.566,
"grad_norm": 0.16045285761356354,
"learning_rate": 8.238182243576512e-06,
"loss": 0.4185,
"step": 283
},
{
"epoch": 0.568,
"grad_norm": 0.14847232401371002,
"learning_rate": 8.226076389281316e-06,
"loss": 0.43,
"step": 284
},
{
"epoch": 0.57,
"grad_norm": 0.1868700236082077,
"learning_rate": 8.213938048432697e-06,
"loss": 0.3437,
"step": 285
},
{
"epoch": 0.572,
"grad_norm": 0.1744498908519745,
"learning_rate": 8.201767343263612e-06,
"loss": 0.4926,
"step": 286
},
{
"epoch": 0.574,
"grad_norm": 0.13156633079051971,
"learning_rate": 8.189564396332927e-06,
"loss": 0.4245,
"step": 287
},
{
"epoch": 0.576,
"grad_norm": 0.17716287076473236,
"learning_rate": 8.177329330524182e-06,
"loss": 0.3134,
"step": 288
},
{
"epoch": 0.578,
"grad_norm": 0.15387575328350067,
"learning_rate": 8.165062269044353e-06,
"loss": 0.3723,
"step": 289
},
{
"epoch": 0.58,
"grad_norm": 0.11926203221082687,
"learning_rate": 8.152763335422612e-06,
"loss": 0.251,
"step": 290
},
{
"epoch": 0.582,
"grad_norm": 0.14692164957523346,
"learning_rate": 8.140432653509089e-06,
"loss": 0.3068,
"step": 291
},
{
"epoch": 0.584,
"grad_norm": 0.12874449789524078,
"learning_rate": 8.128070347473609e-06,
"loss": 0.3449,
"step": 292
},
{
"epoch": 0.586,
"grad_norm": 0.1284901350736618,
"learning_rate": 8.115676541804456e-06,
"loss": 0.2336,
"step": 293
},
{
"epoch": 0.588,
"grad_norm": 0.18448615074157715,
"learning_rate": 8.10325136130712e-06,
"loss": 0.4497,
"step": 294
},
{
"epoch": 0.59,
"grad_norm": 0.18793466687202454,
"learning_rate": 8.090794931103026e-06,
"loss": 0.446,
"step": 295
},
{
"epoch": 0.592,
"grad_norm": 0.11833447217941284,
"learning_rate": 8.078307376628292e-06,
"loss": 0.286,
"step": 296
},
{
"epoch": 0.594,
"grad_norm": 0.14963407814502716,
"learning_rate": 8.065788823632451e-06,
"loss": 0.329,
"step": 297
},
{
"epoch": 0.596,
"grad_norm": 0.1394645869731903,
"learning_rate": 8.053239398177191e-06,
"loss": 0.2671,
"step": 298
},
{
"epoch": 0.598,
"grad_norm": 0.17401300370693207,
"learning_rate": 8.04065922663509e-06,
"loss": 0.5106,
"step": 299
},
{
"epoch": 0.6,
"grad_norm": 0.1559733897447586,
"learning_rate": 8.028048435688333e-06,
"loss": 0.259,
"step": 300
},
{
"epoch": 0.602,
"grad_norm": 0.14853116869926453,
"learning_rate": 8.015407152327448e-06,
"loss": 0.4095,
"step": 301
},
{
"epoch": 0.604,
"grad_norm": 0.13665775954723358,
"learning_rate": 8.002735503850016e-06,
"loss": 0.379,
"step": 302
},
{
"epoch": 0.606,
"grad_norm": 0.15187975764274597,
"learning_rate": 7.990033617859396e-06,
"loss": 0.336,
"step": 303
},
{
"epoch": 0.608,
"grad_norm": 0.17993216216564178,
"learning_rate": 7.97730162226344e-06,
"loss": 0.4718,
"step": 304
},
{
"epoch": 0.61,
"grad_norm": 0.14840970933437347,
"learning_rate": 7.964539645273204e-06,
"loss": 0.3572,
"step": 305
},
{
"epoch": 0.612,
"grad_norm": 0.2386975884437561,
"learning_rate": 7.951747815401651e-06,
"loss": 0.3185,
"step": 306
},
{
"epoch": 0.614,
"grad_norm": 0.21291233599185944,
"learning_rate": 7.938926261462366e-06,
"loss": 0.362,
"step": 307
},
{
"epoch": 0.616,
"grad_norm": 0.16196957230567932,
"learning_rate": 7.92607511256826e-06,
"loss": 0.3024,
"step": 308
},
{
"epoch": 0.618,
"grad_norm": 0.2727487087249756,
"learning_rate": 7.913194498130252e-06,
"loss": 0.5212,
"step": 309
},
{
"epoch": 0.62,
"grad_norm": 0.1640804558992386,
"learning_rate": 7.900284547855992e-06,
"loss": 0.3948,
"step": 310
},
{
"epoch": 0.622,
"grad_norm": 0.22003543376922607,
"learning_rate": 7.887345391748533e-06,
"loss": 0.3745,
"step": 311
},
{
"epoch": 0.624,
"grad_norm": 0.1896262764930725,
"learning_rate": 7.874377160105037e-06,
"loss": 0.4448,
"step": 312
},
{
"epoch": 0.626,
"grad_norm": 0.18609432876110077,
"learning_rate": 7.861379983515449e-06,
"loss": 0.3685,
"step": 313
},
{
"epoch": 0.628,
"grad_norm": 0.14590106904506683,
"learning_rate": 7.848353992861195e-06,
"loss": 0.3338,
"step": 314
},
{
"epoch": 0.63,
"grad_norm": 0.13211271166801453,
"learning_rate": 7.835299319313854e-06,
"loss": 0.3297,
"step": 315
},
{
"epoch": 0.632,
"grad_norm": 0.16736850142478943,
"learning_rate": 7.822216094333847e-06,
"loss": 0.3118,
"step": 316
},
{
"epoch": 0.634,
"grad_norm": 0.17553502321243286,
"learning_rate": 7.8091044496691e-06,
"loss": 0.3447,
"step": 317
},
{
"epoch": 0.636,
"grad_norm": 0.17292480170726776,
"learning_rate": 7.795964517353734e-06,
"loss": 0.3152,
"step": 318
},
{
"epoch": 0.638,
"grad_norm": 0.13962873816490173,
"learning_rate": 7.782796429706721e-06,
"loss": 0.2142,
"step": 319
},
{
"epoch": 0.64,
"grad_norm": 0.19501662254333496,
"learning_rate": 7.769600319330553e-06,
"loss": 0.3923,
"step": 320
},
{
"epoch": 0.642,
"grad_norm": 0.1338018923997879,
"learning_rate": 7.756376319109917e-06,
"loss": 0.3381,
"step": 321
},
{
"epoch": 0.644,
"grad_norm": 0.1579694300889969,
"learning_rate": 7.743124562210351e-06,
"loss": 0.37,
"step": 322
},
{
"epoch": 0.646,
"grad_norm": 0.12136895209550858,
"learning_rate": 7.729845182076896e-06,
"loss": 0.212,
"step": 323
},
{
"epoch": 0.648,
"grad_norm": 0.2188921570777893,
"learning_rate": 7.716538312432767e-06,
"loss": 0.3732,
"step": 324
},
{
"epoch": 0.65,
"grad_norm": 0.1570715606212616,
"learning_rate": 7.703204087277989e-06,
"loss": 0.321,
"step": 325
},
{
"epoch": 0.652,
"grad_norm": 0.19729937613010406,
"learning_rate": 7.689842640888063e-06,
"loss": 0.3955,
"step": 326
},
{
"epoch": 0.654,
"grad_norm": 0.20023679733276367,
"learning_rate": 7.676454107812608e-06,
"loss": 0.4399,
"step": 327
},
{
"epoch": 0.656,
"grad_norm": 0.14793503284454346,
"learning_rate": 7.663038622873999e-06,
"loss": 0.2922,
"step": 328
},
{
"epoch": 0.658,
"grad_norm": 0.16386426985263824,
"learning_rate": 7.649596321166024e-06,
"loss": 0.3495,
"step": 329
},
{
"epoch": 0.66,
"grad_norm": 0.15845847129821777,
"learning_rate": 7.636127338052513e-06,
"loss": 0.3607,
"step": 330
},
{
"epoch": 0.662,
"grad_norm": 0.17752616107463837,
"learning_rate": 7.622631809165972e-06,
"loss": 0.2863,
"step": 331
},
{
"epoch": 0.664,
"grad_norm": 0.2213558405637741,
"learning_rate": 7.60910987040623e-06,
"loss": 0.4411,
"step": 332
},
{
"epoch": 0.666,
"grad_norm": 0.2018650323152542,
"learning_rate": 7.595561657939061e-06,
"loss": 0.418,
"step": 333
},
{
"epoch": 0.668,
"grad_norm": 0.20029357075691223,
"learning_rate": 7.5819873081948105e-06,
"loss": 0.3025,
"step": 334
},
{
"epoch": 0.67,
"grad_norm": 0.1478874832391739,
"learning_rate": 7.568386957867033e-06,
"loss": 0.2437,
"step": 335
},
{
"epoch": 0.672,
"grad_norm": 0.18909971415996552,
"learning_rate": 7.554760743911104e-06,
"loss": 0.3974,
"step": 336
},
{
"epoch": 0.674,
"grad_norm": 0.16544924676418304,
"learning_rate": 7.541108803542846e-06,
"loss": 0.336,
"step": 337
},
{
"epoch": 0.676,
"grad_norm": 0.19204874336719513,
"learning_rate": 7.527431274237149e-06,
"loss": 0.3617,
"step": 338
},
{
"epoch": 0.678,
"grad_norm": 0.1770397573709488,
"learning_rate": 7.5137282937265796e-06,
"loss": 0.3617,
"step": 339
},
{
"epoch": 0.68,
"grad_norm": 0.15880927443504333,
"learning_rate": 7.500000000000001e-06,
"loss": 0.2993,
"step": 340
},
{
"epoch": 0.682,
"grad_norm": 0.4031960368156433,
"learning_rate": 7.486246531301178e-06,
"loss": 0.3137,
"step": 341
},
{
"epoch": 0.684,
"grad_norm": 0.17426829040050507,
"learning_rate": 7.472468026127385e-06,
"loss": 0.3712,
"step": 342
},
{
"epoch": 0.686,
"grad_norm": 0.16782499849796295,
"learning_rate": 7.45866462322802e-06,
"loss": 0.359,
"step": 343
},
{
"epoch": 0.688,
"grad_norm": 0.20207028090953827,
"learning_rate": 7.444836461603195e-06,
"loss": 0.4301,
"step": 344
},
{
"epoch": 0.69,
"grad_norm": 0.18788397312164307,
"learning_rate": 7.430983680502344e-06,
"loss": 0.3609,
"step": 345
},
{
"epoch": 0.692,
"grad_norm": 0.16447116434574127,
"learning_rate": 7.4171064194228196e-06,
"loss": 0.3514,
"step": 346
},
{
"epoch": 0.694,
"grad_norm": 0.15939724445343018,
"learning_rate": 7.403204818108487e-06,
"loss": 0.2747,
"step": 347
},
{
"epoch": 0.696,
"grad_norm": 0.2825759947299957,
"learning_rate": 7.3892790165483164e-06,
"loss": 0.5376,
"step": 348
},
{
"epoch": 0.698,
"grad_norm": 0.15753747522830963,
"learning_rate": 7.3753291549749764e-06,
"loss": 0.2741,
"step": 349
},
{
"epoch": 0.7,
"grad_norm": 0.19103243947029114,
"learning_rate": 7.361355373863415e-06,
"loss": 0.3088,
"step": 350
},
{
"epoch": 0.702,
"grad_norm": 0.18185654282569885,
"learning_rate": 7.347357813929455e-06,
"loss": 0.3204,
"step": 351
},
{
"epoch": 0.704,
"grad_norm": 0.15075427293777466,
"learning_rate": 7.333336616128369e-06,
"loss": 0.2885,
"step": 352
},
{
"epoch": 0.706,
"grad_norm": 0.14092062413692474,
"learning_rate": 7.319291921653464e-06,
"loss": 0.2423,
"step": 353
},
{
"epoch": 0.708,
"grad_norm": 0.11944609135389328,
"learning_rate": 7.305223871934657e-06,
"loss": 0.1367,
"step": 354
},
{
"epoch": 0.71,
"grad_norm": 0.2248326539993286,
"learning_rate": 7.291132608637053e-06,
"loss": 0.4119,
"step": 355
},
{
"epoch": 0.712,
"grad_norm": 0.1844269186258316,
"learning_rate": 7.2770182736595164e-06,
"loss": 0.2714,
"step": 356
},
{
"epoch": 0.714,
"grad_norm": 0.19066232442855835,
"learning_rate": 7.262881009133242e-06,
"loss": 0.432,
"step": 357
},
{
"epoch": 0.716,
"grad_norm": 0.21767167747020721,
"learning_rate": 7.24872095742033e-06,
"loss": 0.3804,
"step": 358
},
{
"epoch": 0.718,
"grad_norm": 0.14823076128959656,
"learning_rate": 7.234538261112342e-06,
"loss": 0.3182,
"step": 359
},
{
"epoch": 0.72,
"grad_norm": 0.1661371886730194,
"learning_rate": 7.2203330630288714e-06,
"loss": 0.3078,
"step": 360
},
{
"epoch": 0.722,
"grad_norm": 0.18412846326828003,
"learning_rate": 7.206105506216107e-06,
"loss": 0.4066,
"step": 361
},
{
"epoch": 0.724,
"grad_norm": 0.17892518639564514,
"learning_rate": 7.191855733945388e-06,
"loss": 0.4772,
"step": 362
},
{
"epoch": 0.726,
"grad_norm": 0.24270282685756683,
"learning_rate": 7.177583889711763e-06,
"loss": 0.3902,
"step": 363
},
{
"epoch": 0.728,
"grad_norm": 0.187135249376297,
"learning_rate": 7.163290117232542e-06,
"loss": 0.3154,
"step": 364
},
{
"epoch": 0.73,
"grad_norm": 0.20502962172031403,
"learning_rate": 7.148974560445859e-06,
"loss": 0.3599,
"step": 365
},
{
"epoch": 0.732,
"grad_norm": 0.1704569160938263,
"learning_rate": 7.1346373635092095e-06,
"loss": 0.3705,
"step": 366
},
{
"epoch": 0.734,
"grad_norm": 0.20562830567359924,
"learning_rate": 7.12027867079801e-06,
"loss": 0.3169,
"step": 367
},
{
"epoch": 0.736,
"grad_norm": 0.19051577150821686,
"learning_rate": 7.105898626904134e-06,
"loss": 0.4571,
"step": 368
},
{
"epoch": 0.738,
"grad_norm": 0.18842366337776184,
"learning_rate": 7.0914973766344645e-06,
"loss": 0.2771,
"step": 369
},
{
"epoch": 0.74,
"grad_norm": 0.14864154160022736,
"learning_rate": 7.0770750650094335e-06,
"loss": 0.2184,
"step": 370
},
{
"epoch": 0.742,
"grad_norm": 0.1662212610244751,
"learning_rate": 7.062631837261556e-06,
"loss": 0.2706,
"step": 371
},
{
"epoch": 0.744,
"grad_norm": 0.15230734646320343,
"learning_rate": 7.048167838833977e-06,
"loss": 0.2611,
"step": 372
},
{
"epoch": 0.746,
"grad_norm": 0.16176356375217438,
"learning_rate": 7.033683215379002e-06,
"loss": 0.3144,
"step": 373
},
{
"epoch": 0.748,
"grad_norm": 0.16796669363975525,
"learning_rate": 7.019178112756625e-06,
"loss": 0.3742,
"step": 374
},
{
"epoch": 0.75,
"grad_norm": 0.16455894708633423,
"learning_rate": 7.004652677033069e-06,
"loss": 0.2426,
"step": 375
},
{
"epoch": 0.75,
"eval_loss": 0.2979236841201782,
"eval_runtime": 76.5795,
"eval_samples_per_second": 7.208,
"eval_steps_per_second": 0.901,
"step": 375
},
{
"epoch": 0.752,
"grad_norm": 0.22792088985443115,
"learning_rate": 6.990107054479313e-06,
"loss": 0.319,
"step": 376
},
{
"epoch": 0.754,
"grad_norm": 0.24258168041706085,
"learning_rate": 6.9755413915696105e-06,
"loss": 0.5036,
"step": 377
},
{
"epoch": 0.756,
"grad_norm": 0.17646639049053192,
"learning_rate": 6.960955834980028e-06,
"loss": 0.3024,
"step": 378
},
{
"epoch": 0.758,
"grad_norm": 0.15006083250045776,
"learning_rate": 6.946350531586959e-06,
"loss": 0.2702,
"step": 379
},
{
"epoch": 0.76,
"grad_norm": 0.15430916845798492,
"learning_rate": 6.931725628465643e-06,
"loss": 0.2492,
"step": 380
},
{
"epoch": 0.762,
"grad_norm": 0.13274860382080078,
"learning_rate": 6.917081272888697e-06,
"loss": 0.2188,
"step": 381
},
{
"epoch": 0.764,
"grad_norm": 0.12552917003631592,
"learning_rate": 6.902417612324615e-06,
"loss": 0.2275,
"step": 382
},
{
"epoch": 0.766,
"grad_norm": 0.14306232333183289,
"learning_rate": 6.887734794436301e-06,
"loss": 0.3204,
"step": 383
},
{
"epoch": 0.768,
"grad_norm": 0.18567156791687012,
"learning_rate": 6.873032967079562e-06,
"loss": 0.4079,
"step": 384
},
{
"epoch": 0.77,
"grad_norm": 0.18761208653450012,
"learning_rate": 6.858312278301638e-06,
"loss": 0.2944,
"step": 385
},
{
"epoch": 0.772,
"grad_norm": 0.18265055119991302,
"learning_rate": 6.8435728763397045e-06,
"loss": 0.4399,
"step": 386
},
{
"epoch": 0.774,
"grad_norm": 0.18840709328651428,
"learning_rate": 6.828814909619374e-06,
"loss": 0.4057,
"step": 387
},
{
"epoch": 0.776,
"grad_norm": 0.19235002994537354,
"learning_rate": 6.814038526753205e-06,
"loss": 0.2826,
"step": 388
},
{
"epoch": 0.778,
"grad_norm": 0.1880473792552948,
"learning_rate": 6.799243876539213e-06,
"loss": 0.3739,
"step": 389
},
{
"epoch": 0.78,
"grad_norm": 0.29550889134407043,
"learning_rate": 6.78443110795936e-06,
"loss": 0.3594,
"step": 390
},
{
"epoch": 0.782,
"grad_norm": 0.19335615634918213,
"learning_rate": 6.76960037017806e-06,
"loss": 0.4026,
"step": 391
},
{
"epoch": 0.784,
"grad_norm": 0.14000019431114197,
"learning_rate": 6.75475181254068e-06,
"loss": 0.2576,
"step": 392
},
{
"epoch": 0.786,
"grad_norm": 0.15106743574142456,
"learning_rate": 6.739885584572026e-06,
"loss": 0.2538,
"step": 393
},
{
"epoch": 0.788,
"grad_norm": 0.19910076260566711,
"learning_rate": 6.725001835974854e-06,
"loss": 0.2867,
"step": 394
},
{
"epoch": 0.79,
"grad_norm": 0.22941169142723083,
"learning_rate": 6.710100716628345e-06,
"loss": 0.3183,
"step": 395
},
{
"epoch": 0.792,
"grad_norm": 0.1540730744600296,
"learning_rate": 6.695182376586603e-06,
"loss": 0.31,
"step": 396
},
{
"epoch": 0.794,
"grad_norm": 0.18420648574829102,
"learning_rate": 6.680246966077151e-06,
"loss": 0.388,
"step": 397
},
{
"epoch": 0.796,
"grad_norm": 0.14336371421813965,
"learning_rate": 6.665294635499404e-06,
"loss": 0.3359,
"step": 398
},
{
"epoch": 0.798,
"grad_norm": 0.21092049777507782,
"learning_rate": 6.650325535423166e-06,
"loss": 0.2935,
"step": 399
},
{
"epoch": 0.8,
"grad_norm": 0.23870034515857697,
"learning_rate": 6.635339816587109e-06,
"loss": 0.3413,
"step": 400
},
{
"epoch": 0.802,
"grad_norm": 0.21548299491405487,
"learning_rate": 6.6203376298972535e-06,
"loss": 0.4255,
"step": 401
},
{
"epoch": 0.804,
"grad_norm": 0.21555306017398834,
"learning_rate": 6.605319126425455e-06,
"loss": 0.4044,
"step": 402
},
{
"epoch": 0.806,
"grad_norm": 0.212354838848114,
"learning_rate": 6.590284457407876e-06,
"loss": 0.3225,
"step": 403
},
{
"epoch": 0.808,
"grad_norm": 0.17822064459323883,
"learning_rate": 6.5752337742434644e-06,
"loss": 0.3449,
"step": 404
},
{
"epoch": 0.81,
"grad_norm": 0.15272925794124603,
"learning_rate": 6.560167228492436e-06,
"loss": 0.2732,
"step": 405
},
{
"epoch": 0.812,
"grad_norm": 0.18225990235805511,
"learning_rate": 6.545084971874738e-06,
"loss": 0.3326,
"step": 406
},
{
"epoch": 0.814,
"grad_norm": 0.1854051798582077,
"learning_rate": 6.529987156268527e-06,
"loss": 0.3603,
"step": 407
},
{
"epoch": 0.816,
"grad_norm": 0.17678527534008026,
"learning_rate": 6.514873933708637e-06,
"loss": 0.2996,
"step": 408
},
{
"epoch": 0.818,
"grad_norm": 0.35500454902648926,
"learning_rate": 6.499745456385054e-06,
"loss": 0.4185,
"step": 409
},
{
"epoch": 0.82,
"grad_norm": 0.18555931746959686,
"learning_rate": 6.484601876641375e-06,
"loss": 0.2208,
"step": 410
},
{
"epoch": 0.822,
"grad_norm": 0.16834326088428497,
"learning_rate": 6.469443346973281e-06,
"loss": 0.3684,
"step": 411
},
{
"epoch": 0.824,
"grad_norm": 0.1469370424747467,
"learning_rate": 6.454270020026996e-06,
"loss": 0.2526,
"step": 412
},
{
"epoch": 0.826,
"grad_norm": 0.19754226505756378,
"learning_rate": 6.439082048597755e-06,
"loss": 0.3341,
"step": 413
},
{
"epoch": 0.828,
"grad_norm": 0.15154729783535004,
"learning_rate": 6.423879585628262e-06,
"loss": 0.2402,
"step": 414
},
{
"epoch": 0.83,
"grad_norm": 0.20265011489391327,
"learning_rate": 6.408662784207149e-06,
"loss": 0.374,
"step": 415
},
{
"epoch": 0.832,
"grad_norm": 0.2674030065536499,
"learning_rate": 6.39343179756744e-06,
"loss": 0.3057,
"step": 416
},
{
"epoch": 0.834,
"grad_norm": 0.1473691463470459,
"learning_rate": 6.378186779084996e-06,
"loss": 0.3684,
"step": 417
},
{
"epoch": 0.836,
"grad_norm": 0.2826951742172241,
"learning_rate": 6.362927882276991e-06,
"loss": 0.2585,
"step": 418
},
{
"epoch": 0.838,
"grad_norm": 0.20093302428722382,
"learning_rate": 6.34765526080034e-06,
"loss": 0.3041,
"step": 419
},
{
"epoch": 0.84,
"grad_norm": 0.1346312314271927,
"learning_rate": 6.332369068450175e-06,
"loss": 0.2105,
"step": 420
},
{
"epoch": 0.842,
"grad_norm": 0.16400040686130524,
"learning_rate": 6.317069459158284e-06,
"loss": 0.2832,
"step": 421
},
{
"epoch": 0.844,
"grad_norm": 0.19443334639072418,
"learning_rate": 6.301756586991561e-06,
"loss": 0.3353,
"step": 422
},
{
"epoch": 0.846,
"grad_norm": 0.22223643958568573,
"learning_rate": 6.286430606150458e-06,
"loss": 0.384,
"step": 423
},
{
"epoch": 0.848,
"grad_norm": 0.16762332618236542,
"learning_rate": 6.271091670967437e-06,
"loss": 0.3826,
"step": 424
},
{
"epoch": 0.85,
"grad_norm": 0.26455458998680115,
"learning_rate": 6.255739935905396e-06,
"loss": 0.4419,
"step": 425
},
{
"epoch": 0.852,
"grad_norm": 0.1570374071598053,
"learning_rate": 6.240375555556145e-06,
"loss": 0.2199,
"step": 426
},
{
"epoch": 0.854,
"grad_norm": 0.16800148785114288,
"learning_rate": 6.22499868463882e-06,
"loss": 0.2561,
"step": 427
},
{
"epoch": 0.856,
"grad_norm": 0.17082828283309937,
"learning_rate": 6.209609477998339e-06,
"loss": 0.3317,
"step": 428
},
{
"epoch": 0.858,
"grad_norm": 0.26214951276779175,
"learning_rate": 6.194208090603845e-06,
"loss": 0.4105,
"step": 429
},
{
"epoch": 0.86,
"grad_norm": 0.17318500578403473,
"learning_rate": 6.178794677547138e-06,
"loss": 0.2216,
"step": 430
},
{
"epoch": 0.862,
"grad_norm": 0.18394838273525238,
"learning_rate": 6.163369394041112e-06,
"loss": 0.3251,
"step": 431
},
{
"epoch": 0.864,
"grad_norm": 0.2352125197649002,
"learning_rate": 6.1479323954182055e-06,
"loss": 0.349,
"step": 432
},
{
"epoch": 0.866,
"grad_norm": 0.18627074360847473,
"learning_rate": 6.132483837128823e-06,
"loss": 0.3048,
"step": 433
},
{
"epoch": 0.868,
"grad_norm": 0.2253945916891098,
"learning_rate": 6.1170238747397715e-06,
"loss": 0.3081,
"step": 434
},
{
"epoch": 0.87,
"grad_norm": 0.1479015201330185,
"learning_rate": 6.101552663932704e-06,
"loss": 0.192,
"step": 435
},
{
"epoch": 0.872,
"grad_norm": 0.1954430192708969,
"learning_rate": 6.08607036050254e-06,
"loss": 0.2251,
"step": 436
},
{
"epoch": 0.874,
"grad_norm": 0.16169880330562592,
"learning_rate": 6.070577120355903e-06,
"loss": 0.2765,
"step": 437
},
{
"epoch": 0.876,
"grad_norm": 0.19537843763828278,
"learning_rate": 6.055073099509549e-06,
"loss": 0.2724,
"step": 438
},
{
"epoch": 0.878,
"grad_norm": 0.1675713211297989,
"learning_rate": 6.039558454088796e-06,
"loss": 0.3164,
"step": 439
},
{
"epoch": 0.88,
"grad_norm": 0.27977389097213745,
"learning_rate": 6.024033340325954e-06,
"loss": 0.4432,
"step": 440
},
{
"epoch": 0.882,
"grad_norm": 0.1879289448261261,
"learning_rate": 6.0084979145587444e-06,
"loss": 0.3558,
"step": 441
},
{
"epoch": 0.884,
"grad_norm": 0.16285355389118195,
"learning_rate": 5.9929523332287275e-06,
"loss": 0.3014,
"step": 442
},
{
"epoch": 0.886,
"grad_norm": 0.2135494202375412,
"learning_rate": 5.977396752879742e-06,
"loss": 0.3124,
"step": 443
},
{
"epoch": 0.888,
"grad_norm": 0.21992646157741547,
"learning_rate": 5.961831330156306e-06,
"loss": 0.3152,
"step": 444
},
{
"epoch": 0.89,
"grad_norm": 0.34824761748313904,
"learning_rate": 5.946256221802052e-06,
"loss": 0.4022,
"step": 445
},
{
"epoch": 0.892,
"grad_norm": 0.3176579177379608,
"learning_rate": 5.930671584658151e-06,
"loss": 0.3373,
"step": 446
},
{
"epoch": 0.894,
"grad_norm": 0.13881681859493256,
"learning_rate": 5.915077575661723e-06,
"loss": 0.2732,
"step": 447
},
{
"epoch": 0.896,
"grad_norm": 0.23585429787635803,
"learning_rate": 5.89947435184427e-06,
"loss": 0.383,
"step": 448
},
{
"epoch": 0.898,
"grad_norm": 0.20338225364685059,
"learning_rate": 5.883862070330079e-06,
"loss": 0.3929,
"step": 449
},
{
"epoch": 0.9,
"grad_norm": 0.5738399028778076,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.3834,
"step": 450
},
{
"epoch": 0.902,
"grad_norm": 0.16114148497581482,
"learning_rate": 5.85261096316312e-06,
"loss": 0.2351,
"step": 451
},
{
"epoch": 0.904,
"grad_norm": 0.16090261936187744,
"learning_rate": 5.8369724522086545e-06,
"loss": 0.2264,
"step": 452
},
{
"epoch": 0.906,
"grad_norm": 0.1992426961660385,
"learning_rate": 5.821325512950886e-06,
"loss": 0.3239,
"step": 453
},
{
"epoch": 0.908,
"grad_norm": 0.1780838966369629,
"learning_rate": 5.805670302954322e-06,
"loss": 0.2997,
"step": 454
},
{
"epoch": 0.91,
"grad_norm": 0.24148645997047424,
"learning_rate": 5.79000697986675e-06,
"loss": 0.3701,
"step": 455
},
{
"epoch": 0.912,
"grad_norm": 0.1544380933046341,
"learning_rate": 5.774335701417662e-06,
"loss": 0.1843,
"step": 456
},
{
"epoch": 0.914,
"grad_norm": 0.20772896707057953,
"learning_rate": 5.758656625416659e-06,
"loss": 0.3617,
"step": 457
},
{
"epoch": 0.916,
"grad_norm": 0.2054608017206192,
"learning_rate": 5.7429699097518585e-06,
"loss": 0.3286,
"step": 458
},
{
"epoch": 0.918,
"grad_norm": 0.1513553261756897,
"learning_rate": 5.727275712388318e-06,
"loss": 0.2149,
"step": 459
},
{
"epoch": 0.92,
"grad_norm": 0.20221109688282013,
"learning_rate": 5.711574191366427e-06,
"loss": 0.2895,
"step": 460
},
{
"epoch": 0.922,
"grad_norm": 0.26075002551078796,
"learning_rate": 5.695865504800328e-06,
"loss": 0.3115,
"step": 461
},
{
"epoch": 0.924,
"grad_norm": 0.2223353236913681,
"learning_rate": 5.680149810876322e-06,
"loss": 0.3065,
"step": 462
},
{
"epoch": 0.926,
"grad_norm": 0.18663600087165833,
"learning_rate": 5.664427267851271e-06,
"loss": 0.2444,
"step": 463
},
{
"epoch": 0.928,
"grad_norm": 0.19538210332393646,
"learning_rate": 5.648698034051009e-06,
"loss": 0.3877,
"step": 464
},
{
"epoch": 0.93,
"grad_norm": 0.1691403090953827,
"learning_rate": 5.632962267868747e-06,
"loss": 0.2445,
"step": 465
},
{
"epoch": 0.932,
"grad_norm": 0.1581772416830063,
"learning_rate": 5.617220127763474e-06,
"loss": 0.3217,
"step": 466
},
{
"epoch": 0.934,
"grad_norm": 0.20001822710037231,
"learning_rate": 5.601471772258368e-06,
"loss": 0.3184,
"step": 467
},
{
"epoch": 0.936,
"grad_norm": 0.3052047789096832,
"learning_rate": 5.585717359939192e-06,
"loss": 0.3479,
"step": 468
},
{
"epoch": 0.938,
"grad_norm": 0.23681974411010742,
"learning_rate": 5.569957049452703e-06,
"loss": 0.3403,
"step": 469
},
{
"epoch": 0.94,
"grad_norm": 0.12364782392978668,
"learning_rate": 5.5541909995050554e-06,
"loss": 0.2085,
"step": 470
},
{
"epoch": 0.942,
"grad_norm": 0.1526976227760315,
"learning_rate": 5.538419368860196e-06,
"loss": 0.2281,
"step": 471
},
{
"epoch": 0.944,
"grad_norm": 0.2230585813522339,
"learning_rate": 5.522642316338268e-06,
"loss": 0.3351,
"step": 472
},
{
"epoch": 0.946,
"grad_norm": 0.17690080404281616,
"learning_rate": 5.506860000814017e-06,
"loss": 0.2985,
"step": 473
},
{
"epoch": 0.948,
"grad_norm": 0.1738656908273697,
"learning_rate": 5.491072581215186e-06,
"loss": 0.247,
"step": 474
},
{
"epoch": 0.95,
"grad_norm": 0.18501204252243042,
"learning_rate": 5.475280216520913e-06,
"loss": 0.2646,
"step": 475
},
{
"epoch": 0.952,
"grad_norm": 0.19721092283725739,
"learning_rate": 5.459483065760138e-06,
"loss": 0.2876,
"step": 476
},
{
"epoch": 0.954,
"grad_norm": 0.16680027544498444,
"learning_rate": 5.443681288009991e-06,
"loss": 0.2167,
"step": 477
},
{
"epoch": 0.956,
"grad_norm": 0.17918136715888977,
"learning_rate": 5.4278750423942e-06,
"loss": 0.3997,
"step": 478
},
{
"epoch": 0.958,
"grad_norm": 0.15725551545619965,
"learning_rate": 5.412064488081482e-06,
"loss": 0.2829,
"step": 479
},
{
"epoch": 0.96,
"grad_norm": 0.19459596276283264,
"learning_rate": 5.396249784283943e-06,
"loss": 0.3373,
"step": 480
},
{
"epoch": 0.962,
"grad_norm": 0.32756415009498596,
"learning_rate": 5.380431090255475e-06,
"loss": 0.4206,
"step": 481
},
{
"epoch": 0.964,
"grad_norm": 0.19843968749046326,
"learning_rate": 5.364608565290154e-06,
"loss": 0.3385,
"step": 482
},
{
"epoch": 0.966,
"grad_norm": 0.15863648056983948,
"learning_rate": 5.348782368720627e-06,
"loss": 0.2524,
"step": 483
},
{
"epoch": 0.968,
"grad_norm": 0.21220897138118744,
"learning_rate": 5.33295265991652e-06,
"loss": 0.2326,
"step": 484
},
{
"epoch": 0.97,
"grad_norm": 0.24547149240970612,
"learning_rate": 5.317119598282823e-06,
"loss": 0.3854,
"step": 485
},
{
"epoch": 0.972,
"grad_norm": 0.2009747326374054,
"learning_rate": 5.301283343258293e-06,
"loss": 0.3141,
"step": 486
},
{
"epoch": 0.974,
"grad_norm": 0.22629286348819733,
"learning_rate": 5.285444054313841e-06,
"loss": 0.3044,
"step": 487
},
{
"epoch": 0.976,
"grad_norm": 0.18528909981250763,
"learning_rate": 5.26960189095093e-06,
"loss": 0.3056,
"step": 488
},
{
"epoch": 0.978,
"grad_norm": 0.18446871638298035,
"learning_rate": 5.253757012699972e-06,
"loss": 0.3206,
"step": 489
},
{
"epoch": 0.98,
"grad_norm": 0.1961178332567215,
"learning_rate": 5.237909579118713e-06,
"loss": 0.386,
"step": 490
},
{
"epoch": 0.982,
"grad_norm": 0.20445547997951508,
"learning_rate": 5.2220597497906315e-06,
"loss": 0.3997,
"step": 491
},
{
"epoch": 0.984,
"grad_norm": 0.17709751427173615,
"learning_rate": 5.206207684323337e-06,
"loss": 0.3212,
"step": 492
},
{
"epoch": 0.986,
"grad_norm": 0.15768595039844513,
"learning_rate": 5.190353542346951e-06,
"loss": 0.2752,
"step": 493
},
{
"epoch": 0.988,
"grad_norm": 0.14925841987133026,
"learning_rate": 5.174497483512506e-06,
"loss": 0.2593,
"step": 494
},
{
"epoch": 0.99,
"grad_norm": 0.2051381766796112,
"learning_rate": 5.15863966749034e-06,
"loss": 0.3941,
"step": 495
},
{
"epoch": 0.992,
"grad_norm": 0.2395932674407959,
"learning_rate": 5.142780253968481e-06,
"loss": 0.3136,
"step": 496
},
{
"epoch": 0.994,
"grad_norm": 0.2152215540409088,
"learning_rate": 5.126919402651053e-06,
"loss": 0.3083,
"step": 497
},
{
"epoch": 0.996,
"grad_norm": 0.17021948099136353,
"learning_rate": 5.111057273256648e-06,
"loss": 0.3185,
"step": 498
},
{
"epoch": 0.998,
"grad_norm": 0.22681966423988342,
"learning_rate": 5.095194025516733e-06,
"loss": 0.4107,
"step": 499
},
{
"epoch": 1.0,
"grad_norm": 0.22234933078289032,
"learning_rate": 5.07932981917404e-06,
"loss": 0.3672,
"step": 500
},
{
"epoch": 1.0,
"eval_loss": 0.27911150455474854,
"eval_runtime": 76.7158,
"eval_samples_per_second": 7.195,
"eval_steps_per_second": 0.899,
"step": 500
},
{
"epoch": 1.002,
"grad_norm": 0.18890836834907532,
"learning_rate": 5.063464813980948e-06,
"loss": 0.2277,
"step": 501
},
{
"epoch": 1.004,
"grad_norm": 0.19094686210155487,
"learning_rate": 5.0475991696978844e-06,
"loss": 0.3602,
"step": 502
},
{
"epoch": 1.006,
"grad_norm": 0.24123992025852203,
"learning_rate": 5.03173304609171e-06,
"loss": 0.2796,
"step": 503
},
{
"epoch": 1.008,
"grad_norm": 0.2091682106256485,
"learning_rate": 5.015866602934112e-06,
"loss": 0.333,
"step": 504
},
{
"epoch": 1.01,
"grad_norm": 0.21148917078971863,
"learning_rate": 5e-06,
"loss": 0.4005,
"step": 505
},
{
"epoch": 1.012,
"grad_norm": 0.14547854661941528,
"learning_rate": 4.984133397065889e-06,
"loss": 0.2223,
"step": 506
},
{
"epoch": 1.014,
"grad_norm": 0.23349957168102264,
"learning_rate": 4.9682669539082914e-06,
"loss": 0.3264,
"step": 507
},
{
"epoch": 1.016,
"grad_norm": 0.16822971403598785,
"learning_rate": 4.952400830302117e-06,
"loss": 0.3151,
"step": 508
},
{
"epoch": 1.018,
"grad_norm": 0.1795063018798828,
"learning_rate": 4.936535186019053e-06,
"loss": 0.2896,
"step": 509
},
{
"epoch": 1.02,
"grad_norm": 0.19863282144069672,
"learning_rate": 4.9206701808259605e-06,
"loss": 0.2481,
"step": 510
},
{
"epoch": 1.022,
"grad_norm": 0.18788766860961914,
"learning_rate": 4.904805974483267e-06,
"loss": 0.3513,
"step": 511
},
{
"epoch": 1.024,
"grad_norm": 0.1949293315410614,
"learning_rate": 4.888942726743353e-06,
"loss": 0.2264,
"step": 512
},
{
"epoch": 1.002,
"grad_norm": 0.16474653780460358,
"learning_rate": 4.873080597348948e-06,
"loss": 0.2793,
"step": 513
},
{
"epoch": 1.004,
"grad_norm": 0.20230461657047272,
"learning_rate": 4.85721974603152e-06,
"loss": 0.3618,
"step": 514
},
{
"epoch": 1.006,
"grad_norm": 0.16907107830047607,
"learning_rate": 4.841360332509663e-06,
"loss": 0.2708,
"step": 515
},
{
"epoch": 1.008,
"grad_norm": 0.22199520468711853,
"learning_rate": 4.825502516487497e-06,
"loss": 0.3405,
"step": 516
},
{
"epoch": 1.01,
"grad_norm": 0.17370116710662842,
"learning_rate": 4.809646457653051e-06,
"loss": 0.2715,
"step": 517
},
{
"epoch": 1.012,
"grad_norm": 0.21842899918556213,
"learning_rate": 4.793792315676665e-06,
"loss": 0.1802,
"step": 518
},
{
"epoch": 1.014,
"grad_norm": 0.1792248785495758,
"learning_rate": 4.777940250209369e-06,
"loss": 0.1912,
"step": 519
},
{
"epoch": 1.016,
"grad_norm": 0.24431253969669342,
"learning_rate": 4.762090420881289e-06,
"loss": 0.3494,
"step": 520
},
{
"epoch": 1.018,
"grad_norm": 0.1893794983625412,
"learning_rate": 4.74624298730003e-06,
"loss": 0.246,
"step": 521
},
{
"epoch": 1.02,
"grad_norm": 0.29100745916366577,
"learning_rate": 4.7303981090490715e-06,
"loss": 0.4553,
"step": 522
},
{
"epoch": 1.022,
"grad_norm": 0.21313871443271637,
"learning_rate": 4.71455594568616e-06,
"loss": 0.3414,
"step": 523
},
{
"epoch": 1.024,
"grad_norm": 0.257988840341568,
"learning_rate": 4.6987166567417085e-06,
"loss": 0.3223,
"step": 524
},
{
"epoch": 1.026,
"grad_norm": 0.1500207781791687,
"learning_rate": 4.682880401717178e-06,
"loss": 0.2883,
"step": 525
},
{
"epoch": 1.028,
"grad_norm": 0.2195630818605423,
"learning_rate": 4.667047340083481e-06,
"loss": 0.4185,
"step": 526
},
{
"epoch": 1.03,
"grad_norm": 0.24663732945919037,
"learning_rate": 4.651217631279374e-06,
"loss": 0.312,
"step": 527
},
{
"epoch": 1.032,
"grad_norm": 0.23168163001537323,
"learning_rate": 4.635391434709847e-06,
"loss": 0.3826,
"step": 528
},
{
"epoch": 1.034,
"grad_norm": 0.20334544777870178,
"learning_rate": 4.619568909744524e-06,
"loss": 0.302,
"step": 529
},
{
"epoch": 1.036,
"grad_norm": 0.2471403032541275,
"learning_rate": 4.603750215716057e-06,
"loss": 0.3024,
"step": 530
},
{
"epoch": 1.038,
"grad_norm": 0.19385652244091034,
"learning_rate": 4.587935511918521e-06,
"loss": 0.2803,
"step": 531
},
{
"epoch": 1.04,
"grad_norm": 0.24697639048099518,
"learning_rate": 4.572124957605803e-06,
"loss": 0.4114,
"step": 532
},
{
"epoch": 1.042,
"grad_norm": 0.24823316931724548,
"learning_rate": 4.55631871199001e-06,
"loss": 0.3705,
"step": 533
},
{
"epoch": 1.044,
"grad_norm": 0.1970013827085495,
"learning_rate": 4.5405169342398634e-06,
"loss": 0.3608,
"step": 534
},
{
"epoch": 1.046,
"grad_norm": 0.20955346524715424,
"learning_rate": 4.524719783479088e-06,
"loss": 0.347,
"step": 535
},
{
"epoch": 1.048,
"grad_norm": 0.1911235898733139,
"learning_rate": 4.5089274187848144e-06,
"loss": 0.2342,
"step": 536
},
{
"epoch": 1.05,
"grad_norm": 0.22940923273563385,
"learning_rate": 4.493139999185984e-06,
"loss": 0.2803,
"step": 537
},
{
"epoch": 1.052,
"grad_norm": 0.24347023665905,
"learning_rate": 4.477357683661734e-06,
"loss": 0.3833,
"step": 538
},
{
"epoch": 1.054,
"grad_norm": 0.24687382578849792,
"learning_rate": 4.461580631139806e-06,
"loss": 0.3467,
"step": 539
},
{
"epoch": 1.056,
"grad_norm": 0.15779221057891846,
"learning_rate": 4.445809000494945e-06,
"loss": 0.2781,
"step": 540
},
{
"epoch": 1.058,
"grad_norm": 0.20665578544139862,
"learning_rate": 4.430042950547298e-06,
"loss": 0.4656,
"step": 541
},
{
"epoch": 1.06,
"grad_norm": 0.24457348883152008,
"learning_rate": 4.414282640060809e-06,
"loss": 0.2684,
"step": 542
},
{
"epoch": 1.062,
"grad_norm": 0.20804962515830994,
"learning_rate": 4.398528227741634e-06,
"loss": 0.3577,
"step": 543
},
{
"epoch": 1.064,
"grad_norm": 0.2586953043937683,
"learning_rate": 4.382779872236527e-06,
"loss": 0.3492,
"step": 544
},
{
"epoch": 1.066,
"grad_norm": 0.26488688588142395,
"learning_rate": 4.367037732131254e-06,
"loss": 0.3954,
"step": 545
},
{
"epoch": 1.068,
"grad_norm": 0.15630888938903809,
"learning_rate": 4.3513019659489916e-06,
"loss": 0.1673,
"step": 546
},
{
"epoch": 1.07,
"grad_norm": 0.15465758740901947,
"learning_rate": 4.33557273214873e-06,
"loss": 0.2532,
"step": 547
},
{
"epoch": 1.072,
"grad_norm": 0.25680503249168396,
"learning_rate": 4.319850189123681e-06,
"loss": 0.3065,
"step": 548
},
{
"epoch": 1.074,
"grad_norm": 0.24224849045276642,
"learning_rate": 4.304134495199675e-06,
"loss": 0.4157,
"step": 549
},
{
"epoch": 1.076,
"grad_norm": 0.1849289834499359,
"learning_rate": 4.2884258086335755e-06,
"loss": 0.3611,
"step": 550
},
{
"epoch": 1.078,
"grad_norm": 0.2488396316766739,
"learning_rate": 4.272724287611684e-06,
"loss": 0.313,
"step": 551
},
{
"epoch": 1.08,
"grad_norm": 0.23535999655723572,
"learning_rate": 4.257030090248142e-06,
"loss": 0.3165,
"step": 552
},
{
"epoch": 1.082,
"grad_norm": 0.19105635583400726,
"learning_rate": 4.241343374583343e-06,
"loss": 0.2779,
"step": 553
},
{
"epoch": 1.084,
"grad_norm": 0.22108493745326996,
"learning_rate": 4.225664298582339e-06,
"loss": 0.3312,
"step": 554
},
{
"epoch": 1.086,
"grad_norm": 0.18127895891666412,
"learning_rate": 4.209993020133251e-06,
"loss": 0.2099,
"step": 555
},
{
"epoch": 1.088,
"grad_norm": 0.304030179977417,
"learning_rate": 4.194329697045681e-06,
"loss": 0.4397,
"step": 556
},
{
"epoch": 1.09,
"grad_norm": 0.16876006126403809,
"learning_rate": 4.178674487049116e-06,
"loss": 0.253,
"step": 557
},
{
"epoch": 1.092,
"grad_norm": 0.18693579733371735,
"learning_rate": 4.163027547791347e-06,
"loss": 0.2696,
"step": 558
},
{
"epoch": 1.094,
"grad_norm": 0.2209119349718094,
"learning_rate": 4.147389036836881e-06,
"loss": 0.2225,
"step": 559
},
{
"epoch": 1.096,
"grad_norm": 0.1712501347064972,
"learning_rate": 4.131759111665349e-06,
"loss": 0.2205,
"step": 560
},
{
"epoch": 1.098,
"grad_norm": 0.18427731096744537,
"learning_rate": 4.116137929669921e-06,
"loss": 0.2527,
"step": 561
},
{
"epoch": 1.1,
"grad_norm": 0.16298742592334747,
"learning_rate": 4.100525648155731e-06,
"loss": 0.2583,
"step": 562
},
{
"epoch": 1.102,
"grad_norm": 0.1921571046113968,
"learning_rate": 4.084922424338277e-06,
"loss": 0.2931,
"step": 563
},
{
"epoch": 1.104,
"grad_norm": 0.1696956604719162,
"learning_rate": 4.06932841534185e-06,
"loss": 0.2686,
"step": 564
},
{
"epoch": 1.106,
"grad_norm": 0.2463129460811615,
"learning_rate": 4.053743778197951e-06,
"loss": 0.301,
"step": 565
},
{
"epoch": 1.108,
"grad_norm": 0.15761299431324005,
"learning_rate": 4.038168669843698e-06,
"loss": 0.1756,
"step": 566
},
{
"epoch": 1.11,
"grad_norm": 0.1688557118177414,
"learning_rate": 4.02260324712026e-06,
"loss": 0.2969,
"step": 567
},
{
"epoch": 1.112,
"grad_norm": 0.21805354952812195,
"learning_rate": 4.007047666771274e-06,
"loss": 0.2739,
"step": 568
},
{
"epoch": 1.114,
"grad_norm": 0.17749401926994324,
"learning_rate": 3.991502085441259e-06,
"loss": 0.2698,
"step": 569
},
{
"epoch": 1.116,
"grad_norm": 0.2537892758846283,
"learning_rate": 3.975966659674048e-06,
"loss": 0.4131,
"step": 570
},
{
"epoch": 1.1179999999999999,
"grad_norm": 0.15672741830348969,
"learning_rate": 3.960441545911205e-06,
"loss": 0.2118,
"step": 571
},
{
"epoch": 1.12,
"grad_norm": 0.23960451781749725,
"learning_rate": 3.944926900490452e-06,
"loss": 0.2715,
"step": 572
},
{
"epoch": 1.1219999999999999,
"grad_norm": 0.17803031206130981,
"learning_rate": 3.929422879644099e-06,
"loss": 0.24,
"step": 573
},
{
"epoch": 1.124,
"grad_norm": 0.2676704525947571,
"learning_rate": 3.913929639497462e-06,
"loss": 0.3247,
"step": 574
},
{
"epoch": 1.126,
"grad_norm": 0.1522570550441742,
"learning_rate": 3.898447336067297e-06,
"loss": 0.2298,
"step": 575
},
{
"epoch": 1.1280000000000001,
"grad_norm": 0.23372875154018402,
"learning_rate": 3.882976125260229e-06,
"loss": 0.4375,
"step": 576
},
{
"epoch": 1.13,
"grad_norm": 0.3442481756210327,
"learning_rate": 3.867516162871177e-06,
"loss": 0.2883,
"step": 577
},
{
"epoch": 1.1320000000000001,
"grad_norm": 0.2335498332977295,
"learning_rate": 3.8520676045817945e-06,
"loss": 0.2602,
"step": 578
},
{
"epoch": 1.134,
"grad_norm": 0.29386457800865173,
"learning_rate": 3.8366306059588885e-06,
"loss": 0.3826,
"step": 579
},
{
"epoch": 1.1360000000000001,
"grad_norm": 0.18141314387321472,
"learning_rate": 3.821205322452863e-06,
"loss": 0.205,
"step": 580
},
{
"epoch": 1.138,
"grad_norm": 0.21235667169094086,
"learning_rate": 3.8057919093961554e-06,
"loss": 0.2511,
"step": 581
},
{
"epoch": 1.1400000000000001,
"grad_norm": 0.15281343460083008,
"learning_rate": 3.790390522001662e-06,
"loss": 0.1908,
"step": 582
},
{
"epoch": 1.142,
"grad_norm": 0.1883106231689453,
"learning_rate": 3.775001315361183e-06,
"loss": 0.2896,
"step": 583
},
{
"epoch": 1.144,
"grad_norm": 0.19878095388412476,
"learning_rate": 3.7596244444438577e-06,
"loss": 0.2847,
"step": 584
},
{
"epoch": 1.146,
"grad_norm": 0.18822817504405975,
"learning_rate": 3.7442600640946045e-06,
"loss": 0.3134,
"step": 585
},
{
"epoch": 1.148,
"grad_norm": 0.21552503108978271,
"learning_rate": 3.7289083290325668e-06,
"loss": 0.3323,
"step": 586
},
{
"epoch": 1.15,
"grad_norm": 0.25933748483657837,
"learning_rate": 3.7135693938495433e-06,
"loss": 0.3463,
"step": 587
},
{
"epoch": 1.152,
"grad_norm": 0.23867465555667877,
"learning_rate": 3.69824341300844e-06,
"loss": 0.3601,
"step": 588
},
{
"epoch": 1.154,
"grad_norm": 0.3167083263397217,
"learning_rate": 3.682930540841717e-06,
"loss": 0.4182,
"step": 589
},
{
"epoch": 1.156,
"grad_norm": 0.31397873163223267,
"learning_rate": 3.667630931549826e-06,
"loss": 0.3287,
"step": 590
},
{
"epoch": 1.158,
"grad_norm": 0.18764562904834747,
"learning_rate": 3.6523447391996613e-06,
"loss": 0.276,
"step": 591
},
{
"epoch": 1.16,
"grad_norm": 0.29411885142326355,
"learning_rate": 3.637072117723012e-06,
"loss": 0.3956,
"step": 592
},
{
"epoch": 1.162,
"grad_norm": 0.19027218222618103,
"learning_rate": 3.6218132209150047e-06,
"loss": 0.2753,
"step": 593
},
{
"epoch": 1.164,
"grad_norm": 0.20175009965896606,
"learning_rate": 3.606568202432562e-06,
"loss": 0.3459,
"step": 594
},
{
"epoch": 1.166,
"grad_norm": 0.2005695253610611,
"learning_rate": 3.5913372157928515e-06,
"loss": 0.2125,
"step": 595
},
{
"epoch": 1.168,
"grad_norm": 0.22972247004508972,
"learning_rate": 3.5761204143717387e-06,
"loss": 0.2925,
"step": 596
},
{
"epoch": 1.17,
"grad_norm": 0.22252865135669708,
"learning_rate": 3.560917951402245e-06,
"loss": 0.3467,
"step": 597
},
{
"epoch": 1.172,
"grad_norm": 0.2404780089855194,
"learning_rate": 3.5457299799730047e-06,
"loss": 0.3268,
"step": 598
},
{
"epoch": 1.174,
"grad_norm": 0.24187296628952026,
"learning_rate": 3.5305566530267217e-06,
"loss": 0.3654,
"step": 599
},
{
"epoch": 1.176,
"grad_norm": 0.23365625739097595,
"learning_rate": 3.5153981233586277e-06,
"loss": 0.3168,
"step": 600
},
{
"epoch": 1.178,
"grad_norm": 0.20350268483161926,
"learning_rate": 3.5002545436149478e-06,
"loss": 0.2618,
"step": 601
},
{
"epoch": 1.18,
"grad_norm": 0.22084195911884308,
"learning_rate": 3.4851260662913643e-06,
"loss": 0.381,
"step": 602
},
{
"epoch": 1.182,
"grad_norm": 0.5043354630470276,
"learning_rate": 3.470012843731476e-06,
"loss": 0.426,
"step": 603
},
{
"epoch": 1.184,
"grad_norm": 0.23615571856498718,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.3891,
"step": 604
},
{
"epoch": 1.186,
"grad_norm": 0.1776285469532013,
"learning_rate": 3.439832771507565e-06,
"loss": 0.2032,
"step": 605
},
{
"epoch": 1.188,
"grad_norm": 0.23352046310901642,
"learning_rate": 3.4247662257565372e-06,
"loss": 0.2098,
"step": 606
},
{
"epoch": 1.19,
"grad_norm": 0.19145451486110687,
"learning_rate": 3.4097155425921256e-06,
"loss": 0.2612,
"step": 607
},
{
"epoch": 1.192,
"grad_norm": 0.19671331346035004,
"learning_rate": 3.394680873574546e-06,
"loss": 0.2941,
"step": 608
},
{
"epoch": 1.194,
"grad_norm": 0.2002706378698349,
"learning_rate": 3.3796623701027477e-06,
"loss": 0.1828,
"step": 609
},
{
"epoch": 1.196,
"grad_norm": 0.23058104515075684,
"learning_rate": 3.3646601834128924e-06,
"loss": 0.2983,
"step": 610
},
{
"epoch": 1.198,
"grad_norm": 0.13006491959095,
"learning_rate": 3.349674464576834e-06,
"loss": 0.1306,
"step": 611
},
{
"epoch": 1.2,
"grad_norm": 0.29587817192077637,
"learning_rate": 3.3347053645005965e-06,
"loss": 0.3542,
"step": 612
},
{
"epoch": 1.202,
"grad_norm": 0.23100513219833374,
"learning_rate": 3.319753033922849e-06,
"loss": 0.4051,
"step": 613
},
{
"epoch": 1.204,
"grad_norm": 0.24775229394435883,
"learning_rate": 3.3048176234133967e-06,
"loss": 0.2378,
"step": 614
},
{
"epoch": 1.206,
"grad_norm": 0.18648101389408112,
"learning_rate": 3.289899283371657e-06,
"loss": 0.2141,
"step": 615
},
{
"epoch": 1.208,
"grad_norm": 0.24682392179965973,
"learning_rate": 3.274998164025148e-06,
"loss": 0.3123,
"step": 616
},
{
"epoch": 1.21,
"grad_norm": 0.25237175822257996,
"learning_rate": 3.260114415427975e-06,
"loss": 0.4471,
"step": 617
},
{
"epoch": 1.212,
"grad_norm": 0.20262058079242706,
"learning_rate": 3.2452481874593234e-06,
"loss": 0.2694,
"step": 618
},
{
"epoch": 1.214,
"grad_norm": 0.23342056572437286,
"learning_rate": 3.230399629821942e-06,
"loss": 0.3093,
"step": 619
},
{
"epoch": 1.216,
"grad_norm": 0.17575059831142426,
"learning_rate": 3.2155688920406415e-06,
"loss": 0.2923,
"step": 620
},
{
"epoch": 1.218,
"grad_norm": 0.2357223480939865,
"learning_rate": 3.200756123460788e-06,
"loss": 0.3569,
"step": 621
},
{
"epoch": 1.22,
"grad_norm": 0.3179761469364166,
"learning_rate": 3.1859614732467957e-06,
"loss": 0.4442,
"step": 622
},
{
"epoch": 1.222,
"grad_norm": 0.28770139813423157,
"learning_rate": 3.171185090380628e-06,
"loss": 0.3325,
"step": 623
},
{
"epoch": 1.224,
"grad_norm": 0.18547223508358002,
"learning_rate": 3.156427123660297e-06,
"loss": 0.2269,
"step": 624
},
{
"epoch": 1.226,
"grad_norm": 0.21385949850082397,
"learning_rate": 3.141687721698363e-06,
"loss": 0.2615,
"step": 625
},
{
"epoch": 1.226,
"eval_loss": 0.2700715959072113,
"eval_runtime": 76.6157,
"eval_samples_per_second": 7.205,
"eval_steps_per_second": 0.901,
"step": 625
},
{
"epoch": 1.228,
"grad_norm": 0.3386872708797455,
"learning_rate": 3.12696703292044e-06,
"loss": 0.3519,
"step": 626
},
{
"epoch": 1.23,
"grad_norm": 0.19794243574142456,
"learning_rate": 3.1122652055637014e-06,
"loss": 0.2581,
"step": 627
},
{
"epoch": 1.232,
"grad_norm": 0.1912515014410019,
"learning_rate": 3.097582387675385e-06,
"loss": 0.3286,
"step": 628
},
{
"epoch": 1.234,
"grad_norm": 0.18073877692222595,
"learning_rate": 3.0829187271113035e-06,
"loss": 0.2411,
"step": 629
},
{
"epoch": 1.236,
"grad_norm": 0.24173890054225922,
"learning_rate": 3.0682743715343565e-06,
"loss": 0.3853,
"step": 630
},
{
"epoch": 1.238,
"grad_norm": 0.17611730098724365,
"learning_rate": 3.053649468413043e-06,
"loss": 0.1971,
"step": 631
},
{
"epoch": 1.24,
"grad_norm": 0.22723500430583954,
"learning_rate": 3.0390441650199727e-06,
"loss": 0.2852,
"step": 632
},
{
"epoch": 1.242,
"grad_norm": 0.2124418169260025,
"learning_rate": 3.0244586084303908e-06,
"loss": 0.329,
"step": 633
},
{
"epoch": 1.244,
"grad_norm": 0.24569527804851532,
"learning_rate": 3.0098929455206905e-06,
"loss": 0.4141,
"step": 634
},
{
"epoch": 1.246,
"grad_norm": 0.2651529312133789,
"learning_rate": 2.995347322966933e-06,
"loss": 0.2759,
"step": 635
},
{
"epoch": 1.248,
"grad_norm": 0.3110187351703644,
"learning_rate": 2.980821887243377e-06,
"loss": 0.3405,
"step": 636
},
{
"epoch": 1.25,
"grad_norm": 0.23818974196910858,
"learning_rate": 2.966316784621e-06,
"loss": 0.2185,
"step": 637
},
{
"epoch": 1.252,
"grad_norm": 0.32177677750587463,
"learning_rate": 2.951832161166024e-06,
"loss": 0.4972,
"step": 638
},
{
"epoch": 1.254,
"grad_norm": 0.21647526323795319,
"learning_rate": 2.937368162738445e-06,
"loss": 0.4215,
"step": 639
},
{
"epoch": 1.256,
"grad_norm": 0.1766624003648758,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.2439,
"step": 640
},
{
"epoch": 1.258,
"grad_norm": 0.34441429376602173,
"learning_rate": 2.9085026233655367e-06,
"loss": 0.4078,
"step": 641
},
{
"epoch": 1.26,
"grad_norm": 0.30576056241989136,
"learning_rate": 2.8941013730958674e-06,
"loss": 0.4071,
"step": 642
},
{
"epoch": 1.262,
"grad_norm": 0.22246578335762024,
"learning_rate": 2.8797213292019927e-06,
"loss": 0.3456,
"step": 643
},
{
"epoch": 1.264,
"grad_norm": 0.21253855526447296,
"learning_rate": 2.8653626364907918e-06,
"loss": 0.2257,
"step": 644
},
{
"epoch": 1.266,
"grad_norm": 0.22427724301815033,
"learning_rate": 2.851025439554142e-06,
"loss": 0.298,
"step": 645
},
{
"epoch": 1.268,
"grad_norm": 0.19472835958003998,
"learning_rate": 2.8367098827674575e-06,
"loss": 0.3093,
"step": 646
},
{
"epoch": 1.27,
"grad_norm": 0.19399920105934143,
"learning_rate": 2.82241611028824e-06,
"loss": 0.2254,
"step": 647
},
{
"epoch": 1.272,
"grad_norm": 0.23820382356643677,
"learning_rate": 2.8081442660546126e-06,
"loss": 0.2909,
"step": 648
},
{
"epoch": 1.274,
"grad_norm": 0.1856381893157959,
"learning_rate": 2.7938944937838924e-06,
"loss": 0.2367,
"step": 649
},
{
"epoch": 1.276,
"grad_norm": 0.16763170063495636,
"learning_rate": 2.7796669369711294e-06,
"loss": 0.1991,
"step": 650
},
{
"epoch": 1.278,
"grad_norm": 0.25936460494995117,
"learning_rate": 2.7654617388876612e-06,
"loss": 0.3244,
"step": 651
},
{
"epoch": 1.28,
"grad_norm": 0.37680599093437195,
"learning_rate": 2.751279042579672e-06,
"loss": 0.409,
"step": 652
},
{
"epoch": 1.282,
"grad_norm": 0.2094666063785553,
"learning_rate": 2.7371189908667604e-06,
"loss": 0.3523,
"step": 653
},
{
"epoch": 1.284,
"grad_norm": 0.25615018606185913,
"learning_rate": 2.722981726340487e-06,
"loss": 0.3496,
"step": 654
},
{
"epoch": 1.286,
"grad_norm": 0.2155938446521759,
"learning_rate": 2.708867391362948e-06,
"loss": 0.2099,
"step": 655
},
{
"epoch": 1.288,
"grad_norm": 0.2571382522583008,
"learning_rate": 2.694776128065345e-06,
"loss": 0.2505,
"step": 656
},
{
"epoch": 1.29,
"grad_norm": 0.25513583421707153,
"learning_rate": 2.6807080783465376e-06,
"loss": 0.3528,
"step": 657
},
{
"epoch": 1.292,
"grad_norm": 0.21190734207630157,
"learning_rate": 2.6666633838716317e-06,
"loss": 0.3892,
"step": 658
},
{
"epoch": 1.294,
"grad_norm": 0.2990153133869171,
"learning_rate": 2.6526421860705474e-06,
"loss": 0.3916,
"step": 659
},
{
"epoch": 1.296,
"grad_norm": 0.22129324078559875,
"learning_rate": 2.6386446261365874e-06,
"loss": 0.2596,
"step": 660
},
{
"epoch": 1.298,
"grad_norm": 0.2187465876340866,
"learning_rate": 2.6246708450250256e-06,
"loss": 0.3962,
"step": 661
},
{
"epoch": 1.3,
"grad_norm": 0.17136049270629883,
"learning_rate": 2.6107209834516857e-06,
"loss": 0.3483,
"step": 662
},
{
"epoch": 1.302,
"grad_norm": 0.25110378861427307,
"learning_rate": 2.5967951818915137e-06,
"loss": 0.4098,
"step": 663
},
{
"epoch": 1.304,
"grad_norm": 0.3335612118244171,
"learning_rate": 2.5828935805771804e-06,
"loss": 0.3407,
"step": 664
},
{
"epoch": 1.306,
"grad_norm": 0.23392237722873688,
"learning_rate": 2.5690163194976576e-06,
"loss": 0.3893,
"step": 665
},
{
"epoch": 1.308,
"grad_norm": 0.21025826036930084,
"learning_rate": 2.5551635383968063e-06,
"loss": 0.3047,
"step": 666
},
{
"epoch": 1.31,
"grad_norm": 0.20678383111953735,
"learning_rate": 2.5413353767719805e-06,
"loss": 0.3068,
"step": 667
},
{
"epoch": 1.312,
"grad_norm": 0.255937397480011,
"learning_rate": 2.527531973872617e-06,
"loss": 0.2963,
"step": 668
},
{
"epoch": 1.314,
"grad_norm": 0.3448125422000885,
"learning_rate": 2.5137534686988265e-06,
"loss": 0.3944,
"step": 669
},
{
"epoch": 1.316,
"grad_norm": 0.21276655793190002,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.2955,
"step": 670
},
{
"epoch": 1.318,
"grad_norm": 0.2522459030151367,
"learning_rate": 2.486271706273421e-06,
"loss": 0.3536,
"step": 671
},
{
"epoch": 1.32,
"grad_norm": 0.2182285189628601,
"learning_rate": 2.4725687257628533e-06,
"loss": 0.3541,
"step": 672
},
{
"epoch": 1.322,
"grad_norm": 0.30204272270202637,
"learning_rate": 2.4588911964571557e-06,
"loss": 0.268,
"step": 673
},
{
"epoch": 1.324,
"grad_norm": 0.27727144956588745,
"learning_rate": 2.445239256088898e-06,
"loss": 0.3061,
"step": 674
},
{
"epoch": 1.326,
"grad_norm": 0.22263972461223602,
"learning_rate": 2.4316130421329696e-06,
"loss": 0.3317,
"step": 675
},
{
"epoch": 1.328,
"grad_norm": 0.23461495339870453,
"learning_rate": 2.418012691805191e-06,
"loss": 0.3153,
"step": 676
},
{
"epoch": 1.33,
"grad_norm": 0.1453184336423874,
"learning_rate": 2.404438342060941e-06,
"loss": 0.1933,
"step": 677
},
{
"epoch": 1.332,
"grad_norm": 0.20232437551021576,
"learning_rate": 2.3908901295937713e-06,
"loss": 0.1941,
"step": 678
},
{
"epoch": 1.334,
"grad_norm": 0.23894034326076508,
"learning_rate": 2.3773681908340284e-06,
"loss": 0.3198,
"step": 679
},
{
"epoch": 1.336,
"grad_norm": 0.3079819977283478,
"learning_rate": 2.363872661947488e-06,
"loss": 0.3761,
"step": 680
},
{
"epoch": 1.338,
"grad_norm": 0.20794443786144257,
"learning_rate": 2.3504036788339763e-06,
"loss": 0.3837,
"step": 681
},
{
"epoch": 1.34,
"grad_norm": 0.2881450057029724,
"learning_rate": 2.3369613771260006e-06,
"loss": 0.2904,
"step": 682
},
{
"epoch": 1.342,
"grad_norm": 0.20050355792045593,
"learning_rate": 2.323545892187393e-06,
"loss": 0.2323,
"step": 683
},
{
"epoch": 1.3439999999999999,
"grad_norm": 0.22167599201202393,
"learning_rate": 2.310157359111938e-06,
"loss": 0.2501,
"step": 684
},
{
"epoch": 1.346,
"grad_norm": 0.29652273654937744,
"learning_rate": 2.296795912722014e-06,
"loss": 0.3702,
"step": 685
},
{
"epoch": 1.3479999999999999,
"grad_norm": 0.20178988575935364,
"learning_rate": 2.2834616875672362e-06,
"loss": 0.2581,
"step": 686
},
{
"epoch": 1.35,
"grad_norm": 0.25368136167526245,
"learning_rate": 2.2701548179231048e-06,
"loss": 0.3034,
"step": 687
},
{
"epoch": 1.3519999999999999,
"grad_norm": 0.20186640322208405,
"learning_rate": 2.2568754377896516e-06,
"loss": 0.2991,
"step": 688
},
{
"epoch": 1.354,
"grad_norm": 0.2289544939994812,
"learning_rate": 2.2436236808900846e-06,
"loss": 0.3188,
"step": 689
},
{
"epoch": 1.3559999999999999,
"grad_norm": 0.2351309210062027,
"learning_rate": 2.230399680669449e-06,
"loss": 0.2942,
"step": 690
},
{
"epoch": 1.358,
"grad_norm": 0.19411875307559967,
"learning_rate": 2.2172035702932828e-06,
"loss": 0.3415,
"step": 691
},
{
"epoch": 1.3599999999999999,
"grad_norm": 0.23344936966896057,
"learning_rate": 2.204035482646267e-06,
"loss": 0.2904,
"step": 692
},
{
"epoch": 1.362,
"grad_norm": 0.17623913288116455,
"learning_rate": 2.190895550330899e-06,
"loss": 0.1493,
"step": 693
},
{
"epoch": 1.3639999999999999,
"grad_norm": 0.22438128292560577,
"learning_rate": 2.1777839056661555e-06,
"loss": 0.3669,
"step": 694
},
{
"epoch": 1.366,
"grad_norm": 0.25720444321632385,
"learning_rate": 2.1647006806861472e-06,
"loss": 0.4394,
"step": 695
},
{
"epoch": 1.3679999999999999,
"grad_norm": 0.17176856100559235,
"learning_rate": 2.1516460071388062e-06,
"loss": 0.2309,
"step": 696
},
{
"epoch": 1.37,
"grad_norm": 0.26110807061195374,
"learning_rate": 2.1386200164845527e-06,
"loss": 0.4329,
"step": 697
},
{
"epoch": 1.3719999999999999,
"grad_norm": 0.24240969121456146,
"learning_rate": 2.125622839894964e-06,
"loss": 0.2596,
"step": 698
},
{
"epoch": 1.374,
"grad_norm": 0.202704519033432,
"learning_rate": 2.1126546082514665e-06,
"loss": 0.2737,
"step": 699
},
{
"epoch": 1.376,
"grad_norm": 0.20342108607292175,
"learning_rate": 2.09971545214401e-06,
"loss": 0.2692,
"step": 700
},
{
"epoch": 1.3780000000000001,
"grad_norm": 0.3197811543941498,
"learning_rate": 2.086805501869749e-06,
"loss": 0.3117,
"step": 701
},
{
"epoch": 1.38,
"grad_norm": 0.29925206303596497,
"learning_rate": 2.073924887431744e-06,
"loss": 0.2391,
"step": 702
},
{
"epoch": 1.3820000000000001,
"grad_norm": 0.2412380427122116,
"learning_rate": 2.061073738537635e-06,
"loss": 0.2434,
"step": 703
},
{
"epoch": 1.384,
"grad_norm": 0.25253570079803467,
"learning_rate": 2.0482521845983522e-06,
"loss": 0.3284,
"step": 704
},
{
"epoch": 1.3860000000000001,
"grad_norm": 0.18548652529716492,
"learning_rate": 2.0354603547267985e-06,
"loss": 0.2562,
"step": 705
},
{
"epoch": 1.388,
"grad_norm": 0.2307010442018509,
"learning_rate": 2.0226983777365604e-06,
"loss": 0.2445,
"step": 706
},
{
"epoch": 1.3900000000000001,
"grad_norm": 0.1840142160654068,
"learning_rate": 2.009966382140606e-06,
"loss": 0.3521,
"step": 707
},
{
"epoch": 1.392,
"grad_norm": 0.2078990340232849,
"learning_rate": 1.9972644961499853e-06,
"loss": 0.2887,
"step": 708
},
{
"epoch": 1.3940000000000001,
"grad_norm": 0.20442235469818115,
"learning_rate": 1.9845928476725522e-06,
"loss": 0.3453,
"step": 709
},
{
"epoch": 1.396,
"grad_norm": 0.1933489441871643,
"learning_rate": 1.971951564311668e-06,
"loss": 0.3581,
"step": 710
},
{
"epoch": 1.3980000000000001,
"grad_norm": 0.19691258668899536,
"learning_rate": 1.959340773364911e-06,
"loss": 0.2933,
"step": 711
},
{
"epoch": 1.4,
"grad_norm": 0.1842382252216339,
"learning_rate": 1.946760601822809e-06,
"loss": 0.2894,
"step": 712
},
{
"epoch": 1.4020000000000001,
"grad_norm": 0.35139110684394836,
"learning_rate": 1.9342111763675512e-06,
"loss": 0.3405,
"step": 713
},
{
"epoch": 1.404,
"grad_norm": 0.19070106744766235,
"learning_rate": 1.9216926233717087e-06,
"loss": 0.213,
"step": 714
},
{
"epoch": 1.4060000000000001,
"grad_norm": 0.20061296224594116,
"learning_rate": 1.9092050688969736e-06,
"loss": 0.2858,
"step": 715
},
{
"epoch": 1.408,
"grad_norm": 0.30167287588119507,
"learning_rate": 1.8967486386928819e-06,
"loss": 0.4004,
"step": 716
},
{
"epoch": 1.41,
"grad_norm": 0.21128444373607635,
"learning_rate": 1.8843234581955444e-06,
"loss": 0.2326,
"step": 717
},
{
"epoch": 1.412,
"grad_norm": 0.23791776597499847,
"learning_rate": 1.8719296525263925e-06,
"loss": 0.2337,
"step": 718
},
{
"epoch": 1.414,
"grad_norm": 0.27308812737464905,
"learning_rate": 1.859567346490913e-06,
"loss": 0.2667,
"step": 719
},
{
"epoch": 1.416,
"grad_norm": 0.19012384116649628,
"learning_rate": 1.8472366645773892e-06,
"loss": 0.2042,
"step": 720
},
{
"epoch": 1.418,
"grad_norm": 0.2819920480251312,
"learning_rate": 1.8349377309556487e-06,
"loss": 0.3546,
"step": 721
},
{
"epoch": 1.42,
"grad_norm": 0.16963627934455872,
"learning_rate": 1.8226706694758194e-06,
"loss": 0.2087,
"step": 722
},
{
"epoch": 1.422,
"grad_norm": 0.222882941365242,
"learning_rate": 1.810435603667075e-06,
"loss": 0.3519,
"step": 723
},
{
"epoch": 1.424,
"grad_norm": 0.200264573097229,
"learning_rate": 1.798232656736389e-06,
"loss": 0.2172,
"step": 724
},
{
"epoch": 1.426,
"grad_norm": 0.25277942419052124,
"learning_rate": 1.7860619515673034e-06,
"loss": 0.3984,
"step": 725
},
{
"epoch": 1.428,
"grad_norm": 0.24608227610588074,
"learning_rate": 1.7739236107186858e-06,
"loss": 0.2575,
"step": 726
},
{
"epoch": 1.43,
"grad_norm": 0.30379989743232727,
"learning_rate": 1.7618177564234907e-06,
"loss": 0.2949,
"step": 727
},
{
"epoch": 1.432,
"grad_norm": 0.15659303963184357,
"learning_rate": 1.7497445105875377e-06,
"loss": 0.1913,
"step": 728
},
{
"epoch": 1.434,
"grad_norm": 0.2043537199497223,
"learning_rate": 1.7377039947882802e-06,
"loss": 0.2716,
"step": 729
},
{
"epoch": 1.436,
"grad_norm": 0.20367324352264404,
"learning_rate": 1.7256963302735752e-06,
"loss": 0.2358,
"step": 730
},
{
"epoch": 1.438,
"grad_norm": 0.28134340047836304,
"learning_rate": 1.7137216379604727e-06,
"loss": 0.2814,
"step": 731
},
{
"epoch": 1.44,
"grad_norm": 0.2837545871734619,
"learning_rate": 1.7017800384339928e-06,
"loss": 0.3792,
"step": 732
},
{
"epoch": 1.442,
"grad_norm": 0.22841040790081024,
"learning_rate": 1.6898716519459074e-06,
"loss": 0.2819,
"step": 733
},
{
"epoch": 1.444,
"grad_norm": 0.21164868772029877,
"learning_rate": 1.6779965984135376e-06,
"loss": 0.2676,
"step": 734
},
{
"epoch": 1.446,
"grad_norm": 0.2656158208847046,
"learning_rate": 1.6661549974185426e-06,
"loss": 0.284,
"step": 735
},
{
"epoch": 1.448,
"grad_norm": 0.2675846815109253,
"learning_rate": 1.6543469682057105e-06,
"loss": 0.3098,
"step": 736
},
{
"epoch": 1.45,
"grad_norm": 0.2900715172290802,
"learning_rate": 1.6425726296817634e-06,
"loss": 0.3378,
"step": 737
},
{
"epoch": 1.452,
"grad_norm": 0.27534744143486023,
"learning_rate": 1.6308321004141609e-06,
"loss": 0.3497,
"step": 738
},
{
"epoch": 1.454,
"grad_norm": 0.30499523878097534,
"learning_rate": 1.6191254986299044e-06,
"loss": 0.3271,
"step": 739
},
{
"epoch": 1.456,
"grad_norm": 0.1775362193584442,
"learning_rate": 1.6074529422143398e-06,
"loss": 0.1754,
"step": 740
},
{
"epoch": 1.458,
"grad_norm": 0.25734683871269226,
"learning_rate": 1.5958145487099829e-06,
"loss": 0.3568,
"step": 741
},
{
"epoch": 1.46,
"grad_norm": 0.22716552019119263,
"learning_rate": 1.5842104353153286e-06,
"loss": 0.2856,
"step": 742
},
{
"epoch": 1.462,
"grad_norm": 0.2042451947927475,
"learning_rate": 1.5726407188836672e-06,
"loss": 0.2623,
"step": 743
},
{
"epoch": 1.464,
"grad_norm": 0.26923978328704834,
"learning_rate": 1.561105515921915e-06,
"loss": 0.4326,
"step": 744
},
{
"epoch": 1.466,
"grad_norm": 0.22442659735679626,
"learning_rate": 1.549604942589441e-06,
"loss": 0.2867,
"step": 745
},
{
"epoch": 1.468,
"grad_norm": 0.16880613565444946,
"learning_rate": 1.5381391146968866e-06,
"loss": 0.1821,
"step": 746
},
{
"epoch": 1.47,
"grad_norm": 0.24349483847618103,
"learning_rate": 1.5267081477050132e-06,
"loss": 0.2753,
"step": 747
},
{
"epoch": 1.472,
"grad_norm": 0.27072674036026,
"learning_rate": 1.5153121567235334e-06,
"loss": 0.2222,
"step": 748
},
{
"epoch": 1.474,
"grad_norm": 0.291255921125412,
"learning_rate": 1.5039512565099468e-06,
"loss": 0.3485,
"step": 749
},
{
"epoch": 1.476,
"grad_norm": 0.20078301429748535,
"learning_rate": 1.4926255614683931e-06,
"loss": 0.2959,
"step": 750
},
{
"epoch": 1.476,
"eval_loss": 0.2654268741607666,
"eval_runtime": 76.2376,
"eval_samples_per_second": 7.241,
"eval_steps_per_second": 0.905,
"step": 750
},
{
"epoch": 1.478,
"grad_norm": 0.2795911431312561,
"learning_rate": 1.4813351856484981e-06,
"loss": 0.1859,
"step": 751
},
{
"epoch": 1.48,
"grad_norm": 0.35663336515426636,
"learning_rate": 1.470080242744218e-06,
"loss": 0.3358,
"step": 752
},
{
"epoch": 1.482,
"grad_norm": 0.23237483203411102,
"learning_rate": 1.458860846092705e-06,
"loss": 0.2874,
"step": 753
},
{
"epoch": 1.484,
"grad_norm": 0.19958510994911194,
"learning_rate": 1.4476771086731567e-06,
"loss": 0.3507,
"step": 754
},
{
"epoch": 1.486,
"grad_norm": 0.22077733278274536,
"learning_rate": 1.4365291431056871e-06,
"loss": 0.3085,
"step": 755
},
{
"epoch": 1.488,
"grad_norm": 0.31041693687438965,
"learning_rate": 1.4254170616501828e-06,
"loss": 0.3724,
"step": 756
},
{
"epoch": 1.49,
"grad_norm": 0.18345925211906433,
"learning_rate": 1.4143409762051829e-06,
"loss": 0.1957,
"step": 757
},
{
"epoch": 1.492,
"grad_norm": 0.1973162293434143,
"learning_rate": 1.4033009983067454e-06,
"loss": 0.2304,
"step": 758
},
{
"epoch": 1.494,
"grad_norm": 0.2636561095714569,
"learning_rate": 1.3922972391273226e-06,
"loss": 0.3215,
"step": 759
},
{
"epoch": 1.496,
"grad_norm": 0.22231453657150269,
"learning_rate": 1.3813298094746491e-06,
"loss": 0.2346,
"step": 760
},
{
"epoch": 1.498,
"grad_norm": 0.21096548438072205,
"learning_rate": 1.3703988197906209e-06,
"loss": 0.297,
"step": 761
},
{
"epoch": 1.5,
"grad_norm": 0.29171353578567505,
"learning_rate": 1.3595043801501794e-06,
"loss": 0.362,
"step": 762
},
{
"epoch": 1.502,
"grad_norm": 0.2302405834197998,
"learning_rate": 1.3486466002602133e-06,
"loss": 0.3468,
"step": 763
},
{
"epoch": 1.504,
"grad_norm": 0.1669236272573471,
"learning_rate": 1.3378255894584463e-06,
"loss": 0.2525,
"step": 764
},
{
"epoch": 1.506,
"grad_norm": 0.22917306423187256,
"learning_rate": 1.3270414567123342e-06,
"loss": 0.34,
"step": 765
},
{
"epoch": 1.508,
"grad_norm": 0.22837324440479279,
"learning_rate": 1.3162943106179748e-06,
"loss": 0.516,
"step": 766
},
{
"epoch": 1.51,
"grad_norm": 0.1973070204257965,
"learning_rate": 1.305584259399013e-06,
"loss": 0.2083,
"step": 767
},
{
"epoch": 1.512,
"grad_norm": 0.25936761498451233,
"learning_rate": 1.2949114109055417e-06,
"loss": 0.4483,
"step": 768
},
{
"epoch": 1.514,
"grad_norm": 0.23405812680721283,
"learning_rate": 1.2842758726130283e-06,
"loss": 0.3334,
"step": 769
},
{
"epoch": 1.516,
"grad_norm": 0.2227783501148224,
"learning_rate": 1.2736777516212267e-06,
"loss": 0.3724,
"step": 770
},
{
"epoch": 1.518,
"grad_norm": 0.23398268222808838,
"learning_rate": 1.263117154653097e-06,
"loss": 0.2008,
"step": 771
},
{
"epoch": 1.52,
"grad_norm": 0.16665144264698029,
"learning_rate": 1.2525941880537307e-06,
"loss": 0.2177,
"step": 772
},
{
"epoch": 1.522,
"grad_norm": 0.21703177690505981,
"learning_rate": 1.242108957789287e-06,
"loss": 0.2668,
"step": 773
},
{
"epoch": 1.524,
"grad_norm": 0.3440599739551544,
"learning_rate": 1.2316615694459188e-06,
"loss": 0.3352,
"step": 774
},
{
"epoch": 1.526,
"grad_norm": 0.2005206048488617,
"learning_rate": 1.2212521282287093e-06,
"loss": 0.2719,
"step": 775
},
{
"epoch": 1.528,
"grad_norm": 0.2054724395275116,
"learning_rate": 1.210880738960616e-06,
"loss": 0.3181,
"step": 776
},
{
"epoch": 1.53,
"grad_norm": 0.2903349995613098,
"learning_rate": 1.200547506081416e-06,
"loss": 0.3382,
"step": 777
},
{
"epoch": 1.532,
"grad_norm": 0.22862407565116882,
"learning_rate": 1.1902525336466465e-06,
"loss": 0.2544,
"step": 778
},
{
"epoch": 1.534,
"grad_norm": 0.20812873542308807,
"learning_rate": 1.1799959253265668e-06,
"loss": 0.3118,
"step": 779
},
{
"epoch": 1.536,
"grad_norm": 0.2820591330528259,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.3646,
"step": 780
},
{
"epoch": 1.538,
"grad_norm": 0.21943072974681854,
"learning_rate": 1.1595982137788403e-06,
"loss": 0.1957,
"step": 781
},
{
"epoch": 1.54,
"grad_norm": 0.1949055939912796,
"learning_rate": 1.1494573159559214e-06,
"loss": 0.253,
"step": 782
},
{
"epoch": 1.542,
"grad_norm": 0.20829080045223236,
"learning_rate": 1.1393551930550828e-06,
"loss": 0.2558,
"step": 783
},
{
"epoch": 1.544,
"grad_norm": 0.20741114020347595,
"learning_rate": 1.1292919468045876e-06,
"loss": 0.2221,
"step": 784
},
{
"epoch": 1.546,
"grad_norm": 0.24327073991298676,
"learning_rate": 1.1192676785412154e-06,
"loss": 0.2616,
"step": 785
},
{
"epoch": 1.548,
"grad_norm": 0.2541949152946472,
"learning_rate": 1.1092824892092375e-06,
"loss": 0.2435,
"step": 786
},
{
"epoch": 1.55,
"grad_norm": 0.2096426635980606,
"learning_rate": 1.099336479359398e-06,
"loss": 0.2448,
"step": 787
},
{
"epoch": 1.552,
"grad_norm": 0.24535740911960602,
"learning_rate": 1.0894297491479044e-06,
"loss": 0.2892,
"step": 788
},
{
"epoch": 1.554,
"grad_norm": 0.2067105919122696,
"learning_rate": 1.0795623983354214e-06,
"loss": 0.2584,
"step": 789
},
{
"epoch": 1.556,
"grad_norm": 0.2478252500295639,
"learning_rate": 1.0697345262860638e-06,
"loss": 0.3474,
"step": 790
},
{
"epoch": 1.558,
"grad_norm": 0.17269453406333923,
"learning_rate": 1.0599462319663906e-06,
"loss": 0.2793,
"step": 791
},
{
"epoch": 1.56,
"grad_norm": 0.2102997750043869,
"learning_rate": 1.0501976139444191e-06,
"loss": 0.3124,
"step": 792
},
{
"epoch": 1.562,
"grad_norm": 0.29494714736938477,
"learning_rate": 1.0404887703886252e-06,
"loss": 0.2693,
"step": 793
},
{
"epoch": 1.564,
"grad_norm": 0.19094854593276978,
"learning_rate": 1.0308197990669538e-06,
"loss": 0.3593,
"step": 794
},
{
"epoch": 1.5659999999999998,
"grad_norm": 0.20082080364227295,
"learning_rate": 1.0211907973458391e-06,
"loss": 0.2296,
"step": 795
},
{
"epoch": 1.568,
"grad_norm": 0.24483440816402435,
"learning_rate": 1.0116018621892237e-06,
"loss": 0.344,
"step": 796
},
{
"epoch": 1.5699999999999998,
"grad_norm": 0.21700353920459747,
"learning_rate": 1.0020530901575754e-06,
"loss": 0.2562,
"step": 797
},
{
"epoch": 1.572,
"grad_norm": 0.18885864317417145,
"learning_rate": 9.925445774069232e-07,
"loss": 0.2155,
"step": 798
},
{
"epoch": 1.5739999999999998,
"grad_norm": 0.2546456754207611,
"learning_rate": 9.830764196878872e-07,
"loss": 0.3539,
"step": 799
},
{
"epoch": 1.576,
"grad_norm": 0.20347674190998077,
"learning_rate": 9.73648712344707e-07,
"loss": 0.2864,
"step": 800
},
{
"epoch": 1.5779999999999998,
"grad_norm": 0.3315930962562561,
"learning_rate": 9.642615503142927e-07,
"loss": 0.3753,
"step": 801
},
{
"epoch": 1.58,
"grad_norm": 0.18244577944278717,
"learning_rate": 9.549150281252633e-07,
"loss": 0.2116,
"step": 802
},
{
"epoch": 1.5819999999999999,
"grad_norm": 0.24047374725341797,
"learning_rate": 9.456092398969902e-07,
"loss": 0.3352,
"step": 803
},
{
"epoch": 1.584,
"grad_norm": 0.2712211012840271,
"learning_rate": 9.363442793386606e-07,
"loss": 0.4647,
"step": 804
},
{
"epoch": 1.5859999999999999,
"grad_norm": 0.15284787118434906,
"learning_rate": 9.271202397483214e-07,
"loss": 0.2296,
"step": 805
},
{
"epoch": 1.588,
"grad_norm": 0.2665194571018219,
"learning_rate": 9.179372140119524e-07,
"loss": 0.353,
"step": 806
},
{
"epoch": 1.5899999999999999,
"grad_norm": 0.2965538799762726,
"learning_rate": 9.087952946025175e-07,
"loss": 0.2863,
"step": 807
},
{
"epoch": 1.592,
"grad_norm": 0.19379866123199463,
"learning_rate": 8.996945735790447e-07,
"loss": 0.3056,
"step": 808
},
{
"epoch": 1.5939999999999999,
"grad_norm": 0.2339809238910675,
"learning_rate": 8.906351425856952e-07,
"loss": 0.3741,
"step": 809
},
{
"epoch": 1.596,
"grad_norm": 0.2753208577632904,
"learning_rate": 8.816170928508367e-07,
"loss": 0.2715,
"step": 810
},
{
"epoch": 1.5979999999999999,
"grad_norm": 0.2367635816335678,
"learning_rate": 8.7264051518613e-07,
"loss": 0.3268,
"step": 811
},
{
"epoch": 1.6,
"grad_norm": 0.2004977911710739,
"learning_rate": 8.637054999856148e-07,
"loss": 0.2217,
"step": 812
},
{
"epoch": 1.6019999999999999,
"grad_norm": 0.3549105226993561,
"learning_rate": 8.54812137224792e-07,
"loss": 0.3371,
"step": 813
},
{
"epoch": 1.604,
"grad_norm": 0.27921661734580994,
"learning_rate": 8.459605164597268e-07,
"loss": 0.3983,
"step": 814
},
{
"epoch": 1.6059999999999999,
"grad_norm": 0.2014499306678772,
"learning_rate": 8.371507268261436e-07,
"loss": 0.2413,
"step": 815
},
{
"epoch": 1.608,
"grad_norm": 0.20690080523490906,
"learning_rate": 8.283828570385239e-07,
"loss": 0.2012,
"step": 816
},
{
"epoch": 1.6099999999999999,
"grad_norm": 0.21998871862888336,
"learning_rate": 8.196569953892202e-07,
"loss": 0.3298,
"step": 817
},
{
"epoch": 1.612,
"grad_norm": 0.3980468511581421,
"learning_rate": 8.109732297475637e-07,
"loss": 0.3194,
"step": 818
},
{
"epoch": 1.6139999999999999,
"grad_norm": 0.20355728268623352,
"learning_rate": 8.023316475589754e-07,
"loss": 0.1823,
"step": 819
},
{
"epoch": 1.616,
"grad_norm": 0.17916588485240936,
"learning_rate": 7.937323358440935e-07,
"loss": 0.2189,
"step": 820
},
{
"epoch": 1.6179999999999999,
"grad_norm": 0.3024926781654358,
"learning_rate": 7.851753811978924e-07,
"loss": 0.3149,
"step": 821
},
{
"epoch": 1.62,
"grad_norm": 0.20770519971847534,
"learning_rate": 7.766608697888095e-07,
"loss": 0.2967,
"step": 822
},
{
"epoch": 1.6219999999999999,
"grad_norm": 0.2985385060310364,
"learning_rate": 7.681888873578786e-07,
"loss": 0.3245,
"step": 823
},
{
"epoch": 1.624,
"grad_norm": 0.238825723528862,
"learning_rate": 7.597595192178702e-07,
"loss": 0.2024,
"step": 824
},
{
"epoch": 1.626,
"grad_norm": 0.24210689961910248,
"learning_rate": 7.513728502524286e-07,
"loss": 0.3364,
"step": 825
},
{
"epoch": 1.6280000000000001,
"grad_norm": 0.2465432733297348,
"learning_rate": 7.430289649152156e-07,
"loss": 0.3643,
"step": 826
},
{
"epoch": 1.63,
"grad_norm": 0.37851664423942566,
"learning_rate": 7.347279472290647e-07,
"loss": 0.4549,
"step": 827
},
{
"epoch": 1.6320000000000001,
"grad_norm": 0.29046836495399475,
"learning_rate": 7.264698807851328e-07,
"loss": 0.3777,
"step": 828
},
{
"epoch": 1.634,
"grad_norm": 0.17954066395759583,
"learning_rate": 7.182548487420555e-07,
"loss": 0.1817,
"step": 829
},
{
"epoch": 1.6360000000000001,
"grad_norm": 0.21587719023227692,
"learning_rate": 7.100829338251147e-07,
"loss": 0.3208,
"step": 830
},
{
"epoch": 1.638,
"grad_norm": 0.24211935698986053,
"learning_rate": 7.019542183254047e-07,
"loss": 0.302,
"step": 831
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.3430536389350891,
"learning_rate": 6.938687840989972e-07,
"loss": 0.3358,
"step": 832
},
{
"epoch": 1.642,
"grad_norm": 0.26358646154403687,
"learning_rate": 6.858267125661272e-07,
"loss": 0.3329,
"step": 833
},
{
"epoch": 1.6440000000000001,
"grad_norm": 0.21013550460338593,
"learning_rate": 6.778280847103668e-07,
"loss": 0.247,
"step": 834
},
{
"epoch": 1.646,
"grad_norm": 0.17694292962551117,
"learning_rate": 6.698729810778065e-07,
"loss": 0.2205,
"step": 835
},
{
"epoch": 1.6480000000000001,
"grad_norm": 0.15793128311634064,
"learning_rate": 6.619614817762537e-07,
"loss": 0.1541,
"step": 836
},
{
"epoch": 1.65,
"grad_norm": 0.18143923580646515,
"learning_rate": 6.540936664744197e-07,
"loss": 0.2367,
"step": 837
},
{
"epoch": 1.6520000000000001,
"grad_norm": 0.21212640404701233,
"learning_rate": 6.462696144011149e-07,
"loss": 0.3049,
"step": 838
},
{
"epoch": 1.654,
"grad_norm": 0.21567395329475403,
"learning_rate": 6.384894043444568e-07,
"loss": 0.2519,
"step": 839
},
{
"epoch": 1.6560000000000001,
"grad_norm": 0.17464697360992432,
"learning_rate": 6.307531146510754e-07,
"loss": 0.1692,
"step": 840
},
{
"epoch": 1.658,
"grad_norm": 0.23152326047420502,
"learning_rate": 6.230608232253227e-07,
"loss": 0.2823,
"step": 841
},
{
"epoch": 1.6600000000000001,
"grad_norm": 0.3341864049434662,
"learning_rate": 6.154126075284855e-07,
"loss": 0.2823,
"step": 842
},
{
"epoch": 1.662,
"grad_norm": 0.24136964976787567,
"learning_rate": 6.07808544578013e-07,
"loss": 0.3713,
"step": 843
},
{
"epoch": 1.6640000000000001,
"grad_norm": 0.21439406275749207,
"learning_rate": 6.002487109467347e-07,
"loss": 0.2631,
"step": 844
},
{
"epoch": 1.666,
"grad_norm": 0.3102458715438843,
"learning_rate": 5.927331827620902e-07,
"loss": 0.3513,
"step": 845
},
{
"epoch": 1.6680000000000001,
"grad_norm": 0.20326466858386993,
"learning_rate": 5.852620357053651e-07,
"loss": 0.2738,
"step": 846
},
{
"epoch": 1.67,
"grad_norm": 0.185090109705925,
"learning_rate": 5.778353450109286e-07,
"loss": 0.2665,
"step": 847
},
{
"epoch": 1.6720000000000002,
"grad_norm": 0.17061105370521545,
"learning_rate": 5.704531854654721e-07,
"loss": 0.2018,
"step": 848
},
{
"epoch": 1.674,
"grad_norm": 0.18026676774024963,
"learning_rate": 5.631156314072605e-07,
"loss": 0.2182,
"step": 849
},
{
"epoch": 1.6760000000000002,
"grad_norm": 0.24431855976581573,
"learning_rate": 5.558227567253832e-07,
"loss": 0.3036,
"step": 850
},
{
"epoch": 1.678,
"grad_norm": 0.1817561835050583,
"learning_rate": 5.485746348590048e-07,
"loss": 0.2786,
"step": 851
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.20034758746623993,
"learning_rate": 5.413713387966329e-07,
"loss": 0.2073,
"step": 852
},
{
"epoch": 1.682,
"grad_norm": 0.23046346008777618,
"learning_rate": 5.34212941075381e-07,
"loss": 0.2456,
"step": 853
},
{
"epoch": 1.6840000000000002,
"grad_norm": 0.28231683373451233,
"learning_rate": 5.270995137802315e-07,
"loss": 0.2962,
"step": 854
},
{
"epoch": 1.686,
"grad_norm": 0.20535282790660858,
"learning_rate": 5.200311285433213e-07,
"loss": 0.2003,
"step": 855
},
{
"epoch": 1.688,
"grad_norm": 0.27334460616111755,
"learning_rate": 5.130078565432089e-07,
"loss": 0.2784,
"step": 856
},
{
"epoch": 1.69,
"grad_norm": 0.2541443109512329,
"learning_rate": 5.06029768504166e-07,
"loss": 0.3575,
"step": 857
},
{
"epoch": 1.692,
"grad_norm": 0.20568181574344635,
"learning_rate": 4.990969346954611e-07,
"loss": 0.3116,
"step": 858
},
{
"epoch": 1.694,
"grad_norm": 0.2725497782230377,
"learning_rate": 4.922094249306559e-07,
"loss": 0.2698,
"step": 859
},
{
"epoch": 1.696,
"grad_norm": 0.2767050862312317,
"learning_rate": 4.853673085668947e-07,
"loss": 0.3246,
"step": 860
},
{
"epoch": 1.698,
"grad_norm": 0.27081194519996643,
"learning_rate": 4.785706545042141e-07,
"loss": 0.3067,
"step": 861
},
{
"epoch": 1.7,
"grad_norm": 0.2148142009973526,
"learning_rate": 4.7181953118484556e-07,
"loss": 0.335,
"step": 862
},
{
"epoch": 1.702,
"grad_norm": 0.20924992859363556,
"learning_rate": 4.651140065925269e-07,
"loss": 0.2473,
"step": 863
},
{
"epoch": 1.704,
"grad_norm": 0.1969323456287384,
"learning_rate": 4.58454148251814e-07,
"loss": 0.2384,
"step": 864
},
{
"epoch": 1.706,
"grad_norm": 0.21272586286067963,
"learning_rate": 4.5184002322740784e-07,
"loss": 0.1894,
"step": 865
},
{
"epoch": 1.708,
"grad_norm": 0.22230306267738342,
"learning_rate": 4.4527169812347446e-07,
"loss": 0.2878,
"step": 866
},
{
"epoch": 1.71,
"grad_norm": 0.23957069218158722,
"learning_rate": 4.387492390829734e-07,
"loss": 0.2608,
"step": 867
},
{
"epoch": 1.712,
"grad_norm": 0.19603803753852844,
"learning_rate": 4.322727117869951e-07,
"loss": 0.2291,
"step": 868
},
{
"epoch": 1.714,
"grad_norm": 0.19814668595790863,
"learning_rate": 4.2584218145409916e-07,
"loss": 0.2933,
"step": 869
},
{
"epoch": 1.716,
"grad_norm": 0.2840145230293274,
"learning_rate": 4.194577128396521e-07,
"loss": 0.2678,
"step": 870
},
{
"epoch": 1.718,
"grad_norm": 0.3841419816017151,
"learning_rate": 4.131193702351827e-07,
"loss": 0.4492,
"step": 871
},
{
"epoch": 1.72,
"grad_norm": 0.1749158352613449,
"learning_rate": 4.0682721746773346e-07,
"loss": 0.2205,
"step": 872
},
{
"epoch": 1.722,
"grad_norm": 0.22776730358600616,
"learning_rate": 4.005813178992091e-07,
"loss": 0.2634,
"step": 873
},
{
"epoch": 1.724,
"grad_norm": 0.20322760939598083,
"learning_rate": 3.9438173442575e-07,
"loss": 0.3125,
"step": 874
},
{
"epoch": 1.726,
"grad_norm": 0.24371430277824402,
"learning_rate": 3.882285294770938e-07,
"loss": 0.3223,
"step": 875
},
{
"epoch": 1.726,
"eval_loss": 0.26352861523628235,
"eval_runtime": 76.577,
"eval_samples_per_second": 7.208,
"eval_steps_per_second": 0.901,
"step": 875
},
{
"epoch": 1.728,
"grad_norm": 0.2777194678783417,
"learning_rate": 3.821217650159453e-07,
"loss": 0.3117,
"step": 876
},
{
"epoch": 1.73,
"grad_norm": 0.21060119569301605,
"learning_rate": 3.760615025373543e-07,
"loss": 0.2444,
"step": 877
},
{
"epoch": 1.732,
"grad_norm": 0.19364982843399048,
"learning_rate": 3.7004780306809873e-07,
"loss": 0.2534,
"step": 878
},
{
"epoch": 1.734,
"grad_norm": 0.2388126105070114,
"learning_rate": 3.6408072716606346e-07,
"loss": 0.5307,
"step": 879
},
{
"epoch": 1.736,
"grad_norm": 0.21501779556274414,
"learning_rate": 3.581603349196372e-07,
"loss": 0.299,
"step": 880
},
{
"epoch": 1.738,
"grad_norm": 0.2748852074146271,
"learning_rate": 3.522866859471047e-07,
"loss": 0.4626,
"step": 881
},
{
"epoch": 1.74,
"grad_norm": 0.2657471299171448,
"learning_rate": 3.46459839396045e-07,
"loss": 0.2947,
"step": 882
},
{
"epoch": 1.742,
"grad_norm": 0.1825701743364334,
"learning_rate": 3.406798539427386e-07,
"loss": 0.2525,
"step": 883
},
{
"epoch": 1.744,
"grad_norm": 0.18898171186447144,
"learning_rate": 3.3494678779157464e-07,
"loss": 0.2188,
"step": 884
},
{
"epoch": 1.746,
"grad_norm": 0.2019154280424118,
"learning_rate": 3.2926069867446673e-07,
"loss": 0.2575,
"step": 885
},
{
"epoch": 1.748,
"grad_norm": 0.26931118965148926,
"learning_rate": 3.2362164385026704e-07,
"loss": 0.2867,
"step": 886
},
{
"epoch": 1.75,
"grad_norm": 0.25869134068489075,
"learning_rate": 3.180296801041971e-07,
"loss": 0.4233,
"step": 887
},
{
"epoch": 1.752,
"grad_norm": 0.24689964950084686,
"learning_rate": 3.1248486374726884e-07,
"loss": 0.3778,
"step": 888
},
{
"epoch": 1.754,
"grad_norm": 0.2961515486240387,
"learning_rate": 3.069872506157212e-07,
"loss": 0.3767,
"step": 889
},
{
"epoch": 1.756,
"grad_norm": 0.2758214473724365,
"learning_rate": 3.015368960704584e-07,
"loss": 0.4107,
"step": 890
},
{
"epoch": 1.758,
"grad_norm": 0.19258597493171692,
"learning_rate": 2.9613385499648926e-07,
"loss": 0.2285,
"step": 891
},
{
"epoch": 1.76,
"grad_norm": 0.21885156631469727,
"learning_rate": 2.9077818180237693e-07,
"loss": 0.2726,
"step": 892
},
{
"epoch": 1.762,
"grad_norm": 0.20850767195224762,
"learning_rate": 2.8546993041969173e-07,
"loss": 0.3443,
"step": 893
},
{
"epoch": 1.764,
"grad_norm": 0.22747254371643066,
"learning_rate": 2.802091543024671e-07,
"loss": 0.2785,
"step": 894
},
{
"epoch": 1.766,
"grad_norm": 0.18733809888362885,
"learning_rate": 2.7499590642665773e-07,
"loss": 0.2047,
"step": 895
},
{
"epoch": 1.768,
"grad_norm": 0.230934277176857,
"learning_rate": 2.6983023928961406e-07,
"loss": 0.2994,
"step": 896
},
{
"epoch": 1.77,
"grad_norm": 0.1833610087633133,
"learning_rate": 2.647122049095463e-07,
"loss": 0.2064,
"step": 897
},
{
"epoch": 1.772,
"grad_norm": 0.2077609896659851,
"learning_rate": 2.596418548250029e-07,
"loss": 0.2537,
"step": 898
},
{
"epoch": 1.774,
"grad_norm": 0.163072407245636,
"learning_rate": 2.546192400943537e-07,
"loss": 0.194,
"step": 899
},
{
"epoch": 1.776,
"grad_norm": 0.1943567395210266,
"learning_rate": 2.4964441129527337e-07,
"loss": 0.2519,
"step": 900
},
{
"epoch": 1.778,
"grad_norm": 0.18382684886455536,
"learning_rate": 2.447174185242324e-07,
"loss": 0.1944,
"step": 901
},
{
"epoch": 1.78,
"grad_norm": 0.20981475710868835,
"learning_rate": 2.398383113959929e-07,
"loss": 0.173,
"step": 902
},
{
"epoch": 1.782,
"grad_norm": 0.1996649205684662,
"learning_rate": 2.3500713904311023e-07,
"loss": 0.2536,
"step": 903
},
{
"epoch": 1.784,
"grad_norm": 0.2560986578464508,
"learning_rate": 2.3022395011543687e-07,
"loss": 0.374,
"step": 904
},
{
"epoch": 1.786,
"grad_norm": 0.20811672508716583,
"learning_rate": 2.2548879277963065e-07,
"loss": 0.3225,
"step": 905
},
{
"epoch": 1.788,
"grad_norm": 0.1996699571609497,
"learning_rate": 2.2080171471867362e-07,
"loss": 0.2632,
"step": 906
},
{
"epoch": 1.79,
"grad_norm": 0.20678700506687164,
"learning_rate": 2.161627631313923e-07,
"loss": 0.3513,
"step": 907
},
{
"epoch": 1.792,
"grad_norm": 0.20172181725502014,
"learning_rate": 2.1157198473197417e-07,
"loss": 0.2117,
"step": 908
},
{
"epoch": 1.794,
"grad_norm": 0.16854679584503174,
"learning_rate": 2.0702942574950812e-07,
"loss": 0.3006,
"step": 909
},
{
"epoch": 1.796,
"grad_norm": 0.1959567815065384,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.2695,
"step": 910
},
{
"epoch": 1.798,
"grad_norm": 0.1726803481578827,
"learning_rate": 1.9808914852347817e-07,
"loss": 0.2635,
"step": 911
},
{
"epoch": 1.8,
"grad_norm": 0.22450147569179535,
"learning_rate": 1.9369152030840553e-07,
"loss": 0.2598,
"step": 912
},
{
"epoch": 1.802,
"grad_norm": 0.26783040165901184,
"learning_rate": 1.8934229156636453e-07,
"loss": 0.2029,
"step": 913
},
{
"epoch": 1.804,
"grad_norm": 0.2690034508705139,
"learning_rate": 1.8504150609403858e-07,
"loss": 0.2446,
"step": 914
},
{
"epoch": 1.806,
"grad_norm": 0.23306065797805786,
"learning_rate": 1.807892072002898e-07,
"loss": 0.3264,
"step": 915
},
{
"epoch": 1.808,
"grad_norm": 0.2681446075439453,
"learning_rate": 1.765854377057219e-07,
"loss": 0.302,
"step": 916
},
{
"epoch": 1.81,
"grad_norm": 0.19500699639320374,
"learning_rate": 1.724302399422456e-07,
"loss": 0.2066,
"step": 917
},
{
"epoch": 1.812,
"grad_norm": 0.2524206340312958,
"learning_rate": 1.6832365575265742e-07,
"loss": 0.3334,
"step": 918
},
{
"epoch": 1.814,
"grad_norm": 0.2076834887266159,
"learning_rate": 1.6426572649021477e-07,
"loss": 0.2737,
"step": 919
},
{
"epoch": 1.8159999999999998,
"grad_norm": 0.28093916177749634,
"learning_rate": 1.6025649301821877e-07,
"loss": 0.3558,
"step": 920
},
{
"epoch": 1.818,
"grad_norm": 0.24566200375556946,
"learning_rate": 1.562959957096072e-07,
"loss": 0.3636,
"step": 921
},
{
"epoch": 1.8199999999999998,
"grad_norm": 0.2996765077114105,
"learning_rate": 1.5238427444654368e-07,
"loss": 0.3945,
"step": 922
},
{
"epoch": 1.822,
"grad_norm": 0.24855782091617584,
"learning_rate": 1.4852136862001766e-07,
"loss": 0.1894,
"step": 923
},
{
"epoch": 1.8239999999999998,
"grad_norm": 0.2089153230190277,
"learning_rate": 1.4470731712944885e-07,
"loss": 0.3297,
"step": 924
},
{
"epoch": 1.826,
"grad_norm": 0.3130733072757721,
"learning_rate": 1.4094215838229176e-07,
"loss": 0.4001,
"step": 925
},
{
"epoch": 1.8279999999999998,
"grad_norm": 0.2722707688808441,
"learning_rate": 1.372259302936546e-07,
"loss": 0.356,
"step": 926
},
{
"epoch": 1.83,
"grad_norm": 0.15767575800418854,
"learning_rate": 1.3355867028591209e-07,
"loss": 0.2161,
"step": 927
},
{
"epoch": 1.8319999999999999,
"grad_norm": 0.18771317601203918,
"learning_rate": 1.2994041528833267e-07,
"loss": 0.1912,
"step": 928
},
{
"epoch": 1.834,
"grad_norm": 0.15640737116336823,
"learning_rate": 1.263712017367036e-07,
"loss": 0.2173,
"step": 929
},
{
"epoch": 1.8359999999999999,
"grad_norm": 0.2588789463043213,
"learning_rate": 1.2285106557296479e-07,
"loss": 0.3506,
"step": 930
},
{
"epoch": 1.838,
"grad_norm": 0.21290963888168335,
"learning_rate": 1.193800422448499e-07,
"loss": 0.2377,
"step": 931
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.198676198720932,
"learning_rate": 1.1595816670552429e-07,
"loss": 0.1823,
"step": 932
},
{
"epoch": 1.842,
"grad_norm": 0.23629765212535858,
"learning_rate": 1.12585473413237e-07,
"loss": 0.2565,
"step": 933
},
{
"epoch": 1.8439999999999999,
"grad_norm": 0.23395268619060516,
"learning_rate": 1.0926199633097156e-07,
"loss": 0.2184,
"step": 934
},
{
"epoch": 1.846,
"grad_norm": 0.2589554190635681,
"learning_rate": 1.0598776892610685e-07,
"loss": 0.369,
"step": 935
},
{
"epoch": 1.8479999999999999,
"grad_norm": 0.22093115746974945,
"learning_rate": 1.0276282417007399e-07,
"loss": 0.3437,
"step": 936
},
{
"epoch": 1.85,
"grad_norm": 0.23697194457054138,
"learning_rate": 9.958719453803278e-08,
"loss": 0.3288,
"step": 937
},
{
"epoch": 1.8519999999999999,
"grad_norm": 0.22383596003055573,
"learning_rate": 9.646091200853802e-08,
"loss": 0.4897,
"step": 938
},
{
"epoch": 1.854,
"grad_norm": 0.20475724339485168,
"learning_rate": 9.338400806321979e-08,
"loss": 0.257,
"step": 939
},
{
"epoch": 1.8559999999999999,
"grad_norm": 0.263615220785141,
"learning_rate": 9.035651368646647e-08,
"loss": 0.4592,
"step": 940
},
{
"epoch": 1.858,
"grad_norm": 0.24478185176849365,
"learning_rate": 8.737845936511335e-08,
"loss": 0.4337,
"step": 941
},
{
"epoch": 1.8599999999999999,
"grad_norm": 0.2436402142047882,
"learning_rate": 8.444987508813451e-08,
"loss": 0.3344,
"step": 942
},
{
"epoch": 1.862,
"grad_norm": 0.23337677121162415,
"learning_rate": 8.157079034633974e-08,
"loss": 0.2967,
"step": 943
},
{
"epoch": 1.8639999999999999,
"grad_norm": 0.20073962211608887,
"learning_rate": 7.874123413208145e-08,
"loss": 0.1952,
"step": 944
},
{
"epoch": 1.866,
"grad_norm": 0.2582467496395111,
"learning_rate": 7.59612349389599e-08,
"loss": 0.372,
"step": 945
},
{
"epoch": 1.8679999999999999,
"grad_norm": 0.2121819704771042,
"learning_rate": 7.32308207615351e-08,
"loss": 0.2619,
"step": 946
},
{
"epoch": 1.87,
"grad_norm": 0.16836410760879517,
"learning_rate": 7.055001909504755e-08,
"loss": 0.293,
"step": 947
},
{
"epoch": 1.8719999999999999,
"grad_norm": 0.18819768726825714,
"learning_rate": 6.791885693514134e-08,
"loss": 0.2476,
"step": 948
},
{
"epoch": 1.874,
"grad_norm": 0.2157561331987381,
"learning_rate": 6.533736077758868e-08,
"loss": 0.2615,
"step": 949
},
{
"epoch": 1.876,
"grad_norm": 0.24670301377773285,
"learning_rate": 6.280555661802857e-08,
"loss": 0.371,
"step": 950
},
{
"epoch": 1.8780000000000001,
"grad_norm": 0.21483668684959412,
"learning_rate": 6.032346995169968e-08,
"loss": 0.2231,
"step": 951
},
{
"epoch": 1.88,
"grad_norm": 0.1763847917318344,
"learning_rate": 5.7891125773187896e-08,
"loss": 0.2074,
"step": 952
},
{
"epoch": 1.8820000000000001,
"grad_norm": 0.20190970599651337,
"learning_rate": 5.550854857617194e-08,
"loss": 0.3226,
"step": 953
},
{
"epoch": 1.884,
"grad_norm": 0.23266001045703888,
"learning_rate": 5.3175762353177563e-08,
"loss": 0.3055,
"step": 954
},
{
"epoch": 1.8860000000000001,
"grad_norm": 0.26426488161087036,
"learning_rate": 5.089279059533658e-08,
"loss": 0.3319,
"step": 955
},
{
"epoch": 1.888,
"grad_norm": 0.24322916567325592,
"learning_rate": 4.865965629214819e-08,
"loss": 0.2372,
"step": 956
},
{
"epoch": 1.8900000000000001,
"grad_norm": 0.23628686368465424,
"learning_rate": 4.6476381931251366e-08,
"loss": 0.3808,
"step": 957
},
{
"epoch": 1.892,
"grad_norm": 0.16934725642204285,
"learning_rate": 4.434298949819449e-08,
"loss": 0.1737,
"step": 958
},
{
"epoch": 1.8940000000000001,
"grad_norm": 0.30660754442214966,
"learning_rate": 4.225950047621441e-08,
"loss": 0.3483,
"step": 959
},
{
"epoch": 1.896,
"grad_norm": 0.27640894055366516,
"learning_rate": 4.02259358460233e-08,
"loss": 0.3264,
"step": 960
},
{
"epoch": 1.8980000000000001,
"grad_norm": 0.2123912125825882,
"learning_rate": 3.8242316085594923e-08,
"loss": 0.3876,
"step": 961
},
{
"epoch": 1.9,
"grad_norm": 0.2987152636051178,
"learning_rate": 3.630866116995757e-08,
"loss": 0.4525,
"step": 962
},
{
"epoch": 1.9020000000000001,
"grad_norm": 0.22001074254512787,
"learning_rate": 3.44249905709948e-08,
"loss": 0.1842,
"step": 963
},
{
"epoch": 1.904,
"grad_norm": 0.20775096118450165,
"learning_rate": 3.25913232572489e-08,
"loss": 0.3012,
"step": 964
},
{
"epoch": 1.9060000000000001,
"grad_norm": 0.19180834293365479,
"learning_rate": 3.080767769372939e-08,
"loss": 0.2681,
"step": 965
},
{
"epoch": 1.908,
"grad_norm": 0.22222468256950378,
"learning_rate": 2.907407184172706e-08,
"loss": 0.1809,
"step": 966
},
{
"epoch": 1.9100000000000001,
"grad_norm": 0.20555076003074646,
"learning_rate": 2.7390523158633552e-08,
"loss": 0.1482,
"step": 967
},
{
"epoch": 1.912,
"grad_norm": 0.29668375849723816,
"learning_rate": 2.57570485977654e-08,
"loss": 0.2179,
"step": 968
},
{
"epoch": 1.9140000000000001,
"grad_norm": 0.19830183684825897,
"learning_rate": 2.4173664608193592e-08,
"loss": 0.2677,
"step": 969
},
{
"epoch": 1.916,
"grad_norm": 0.23050029575824738,
"learning_rate": 2.264038713457706e-08,
"loss": 0.3348,
"step": 970
},
{
"epoch": 1.9180000000000001,
"grad_norm": 0.36921679973602295,
"learning_rate": 2.1157231617002783e-08,
"loss": 0.4821,
"step": 971
},
{
"epoch": 1.92,
"grad_norm": 0.16172367334365845,
"learning_rate": 1.9724212990830938e-08,
"loss": 0.2348,
"step": 972
},
{
"epoch": 1.9220000000000002,
"grad_norm": 0.18016183376312256,
"learning_rate": 1.834134568654333e-08,
"loss": 0.2486,
"step": 973
},
{
"epoch": 1.924,
"grad_norm": 0.32527899742126465,
"learning_rate": 1.7008643629596866e-08,
"loss": 0.3623,
"step": 974
},
{
"epoch": 1.9260000000000002,
"grad_norm": 0.21802493929862976,
"learning_rate": 1.5726120240288632e-08,
"loss": 0.2155,
"step": 975
},
{
"epoch": 1.928,
"grad_norm": 0.23393763601779938,
"learning_rate": 1.449378843361271e-08,
"loss": 0.284,
"step": 976
},
{
"epoch": 1.9300000000000002,
"grad_norm": 0.2498655915260315,
"learning_rate": 1.3311660619138578e-08,
"loss": 0.2816,
"step": 977
},
{
"epoch": 1.932,
"grad_norm": 0.20273719727993011,
"learning_rate": 1.2179748700879013e-08,
"loss": 0.2945,
"step": 978
},
{
"epoch": 1.9340000000000002,
"grad_norm": 0.16979333758354187,
"learning_rate": 1.109806407717462e-08,
"loss": 0.1949,
"step": 979
},
{
"epoch": 1.936,
"grad_norm": 0.18881943821907043,
"learning_rate": 1.006661764057837e-08,
"loss": 0.2681,
"step": 980
},
{
"epoch": 1.938,
"grad_norm": 0.23016507923603058,
"learning_rate": 9.085419777743465e-09,
"loss": 0.4162,
"step": 981
},
{
"epoch": 1.94,
"grad_norm": 0.21829769015312195,
"learning_rate": 8.15448036932176e-09,
"loss": 0.3911,
"step": 982
},
{
"epoch": 1.942,
"grad_norm": 0.192356139421463,
"learning_rate": 7.273808789862724e-09,
"loss": 0.3076,
"step": 983
},
{
"epoch": 1.944,
"grad_norm": 0.20806097984313965,
"learning_rate": 6.4434139077201865e-09,
"loss": 0.2808,
"step": 984
},
{
"epoch": 1.946,
"grad_norm": 0.2533554434776306,
"learning_rate": 5.6633040849601865e-09,
"loss": 0.264,
"step": 985
},
{
"epoch": 1.948,
"grad_norm": 0.25440603494644165,
"learning_rate": 4.933487177280483e-09,
"loss": 0.386,
"step": 986
},
{
"epoch": 1.95,
"grad_norm": 0.2403300553560257,
"learning_rate": 4.253970533929508e-09,
"loss": 0.2665,
"step": 987
},
{
"epoch": 1.952,
"grad_norm": 0.18095187842845917,
"learning_rate": 3.6247609976319818e-09,
"loss": 0.2414,
"step": 988
},
{
"epoch": 1.954,
"grad_norm": 0.43698740005493164,
"learning_rate": 3.0458649045211897e-09,
"loss": 0.4131,
"step": 989
},
{
"epoch": 1.956,
"grad_norm": 0.2908496856689453,
"learning_rate": 2.5172880840745873e-09,
"loss": 0.2955,
"step": 990
},
{
"epoch": 1.958,
"grad_norm": 0.19435322284698486,
"learning_rate": 2.0390358590538507e-09,
"loss": 0.1839,
"step": 991
},
{
"epoch": 1.96,
"grad_norm": 0.20639224350452423,
"learning_rate": 1.61111304545436e-09,
"loss": 0.336,
"step": 992
},
{
"epoch": 1.962,
"grad_norm": 0.18591168522834778,
"learning_rate": 1.2335239524541298e-09,
"loss": 0.2653,
"step": 993
},
{
"epoch": 1.964,
"grad_norm": 0.2295517921447754,
"learning_rate": 9.062723823710651e-10,
"loss": 0.3478,
"step": 994
},
{
"epoch": 1.966,
"grad_norm": 0.2810915410518646,
"learning_rate": 6.293616306246586e-10,
"loss": 0.3266,
"step": 995
},
{
"epoch": 1.968,
"grad_norm": 0.19316555559635162,
"learning_rate": 4.027944857032395e-10,
"loss": 0.2753,
"step": 996
},
{
"epoch": 1.97,
"grad_norm": 0.24243375658988953,
"learning_rate": 2.265732291356626e-10,
"loss": 0.2786,
"step": 997
},
{
"epoch": 1.972,
"grad_norm": 0.27688726782798767,
"learning_rate": 1.0069963546743833e-10,
"loss": 0.2615,
"step": 998
},
{
"epoch": 1.974,
"grad_norm": 0.18696589767932892,
"learning_rate": 2.5174972244634834e-11,
"loss": 0.2866,
"step": 999
},
{
"epoch": 1.976,
"grad_norm": 0.21791526675224304,
"learning_rate": 0.0,
"loss": 0.2074,
"step": 1000
},
{
"epoch": 1.976,
"eval_loss": 0.26330506801605225,
"eval_runtime": 76.7272,
"eval_samples_per_second": 7.194,
"eval_steps_per_second": 0.899,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.531674674724864e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}