{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.001801801801802, "eval_steps": 500, "global_step": 833, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010810810810810811, "grad_norm": 12.305854329298121, "learning_rate": 2.9999652701989443e-05, "loss": 2.4911, "step": 3 }, { "epoch": 0.021621621621621623, "grad_norm": 4.240144865945876, "learning_rate": 2.9998610824039904e-05, "loss": 2.0652, "step": 6 }, { "epoch": 0.032432432432432434, "grad_norm": 3.2725279352000864, "learning_rate": 2.9996874414396984e-05, "loss": 1.9611, "step": 9 }, { "epoch": 0.043243243243243246, "grad_norm": 3.303814919197607, "learning_rate": 2.9994443553467584e-05, "loss": 2.0248, "step": 12 }, { "epoch": 0.05405405405405406, "grad_norm": 3.003679341904051, "learning_rate": 2.9991318353816112e-05, "loss": 1.9903, "step": 15 }, { "epoch": 0.06486486486486487, "grad_norm": 2.986093823263424, "learning_rate": 2.9987498960159325e-05, "loss": 1.8813, "step": 18 }, { "epoch": 0.07567567567567568, "grad_norm": 2.790555184817465, "learning_rate": 2.99829855493596e-05, "loss": 1.976, "step": 21 }, { "epoch": 0.08648648648648649, "grad_norm": 2.986932414712697, "learning_rate": 2.997777833041674e-05, "loss": 1.9653, "step": 24 }, { "epoch": 0.0972972972972973, "grad_norm": 2.656324556829213, "learning_rate": 2.9971877544458325e-05, "loss": 1.8551, "step": 27 }, { "epoch": 0.10810810810810811, "grad_norm": 3.2168328469993335, "learning_rate": 2.996528346472851e-05, "loss": 1.9812, "step": 30 }, { "epoch": 0.11891891891891893, "grad_norm": 2.696843177932571, "learning_rate": 2.9957996396575407e-05, "loss": 1.9728, "step": 33 }, { "epoch": 0.12972972972972974, "grad_norm": 3.2884895488967603, "learning_rate": 2.995001667743691e-05, "loss": 2.0118, "step": 36 }, { "epoch": 0.14054054054054055, "grad_norm": 2.7402908143169555, "learning_rate": 2.9941344676825106e-05, "loss": 1.9478, "step": 39 }, { "epoch": 0.15135135135135136, "grad_norm": 2.375732605463657, "learning_rate": 2.993198079630913e-05, "loss": 1.9092, "step": 42 }, { "epoch": 0.16216216216216217, "grad_norm": 2.6770114768040143, "learning_rate": 2.9921925469496594e-05, "loss": 2.0062, "step": 45 }, { "epoch": 0.17297297297297298, "grad_norm": 2.7654577179498943, "learning_rate": 2.9911179162013495e-05, "loss": 1.9634, "step": 48 }, { "epoch": 0.1837837837837838, "grad_norm": 2.9383093484984966, "learning_rate": 2.9899742371482663e-05, "loss": 1.9553, "step": 51 }, { "epoch": 0.1945945945945946, "grad_norm": 2.5901057077516443, "learning_rate": 2.988761562750071e-05, "loss": 1.9399, "step": 54 }, { "epoch": 0.20540540540540542, "grad_norm": 2.381483900812707, "learning_rate": 2.9874799491613513e-05, "loss": 1.9434, "step": 57 }, { "epoch": 0.21621621621621623, "grad_norm": 2.4229562035711263, "learning_rate": 2.9861294557290205e-05, "loss": 1.9481, "step": 60 }, { "epoch": 0.22702702702702704, "grad_norm": 2.5010028477455393, "learning_rate": 2.9847101449895692e-05, "loss": 1.9495, "step": 63 }, { "epoch": 0.23783783783783785, "grad_norm": 2.3138487877840244, "learning_rate": 2.9832220826661707e-05, "loss": 1.9259, "step": 66 }, { "epoch": 0.24864864864864866, "grad_norm": 2.5123033559714245, "learning_rate": 2.981665337665636e-05, "loss": 1.9731, "step": 69 }, { "epoch": 0.2594594594594595, "grad_norm": 2.6439754352664644, "learning_rate": 2.9800399820752236e-05, "loss": 1.9819, "step": 72 }, { "epoch": 0.2702702702702703, "grad_norm": 2.3833028996710746, "learning_rate": 2.9783460911593024e-05, "loss": 1.8926, "step": 75 }, { "epoch": 0.2810810810810811, "grad_norm": 2.4316596585025687, "learning_rate": 2.9765837433558652e-05, "loss": 1.9008, "step": 78 }, { "epoch": 0.2918918918918919, "grad_norm": 2.492002538880973, "learning_rate": 2.9747530202728965e-05, "loss": 1.9837, "step": 81 }, { "epoch": 0.3027027027027027, "grad_norm": 2.50223915481049, "learning_rate": 2.9728540066845944e-05, "loss": 1.9134, "step": 84 }, { "epoch": 0.31351351351351353, "grad_norm": 2.79878301554176, "learning_rate": 2.9708867905274444e-05, "loss": 1.9226, "step": 87 }, { "epoch": 0.32432432432432434, "grad_norm": 2.2798098795033477, "learning_rate": 2.9688514628961473e-05, "loss": 1.9029, "step": 90 }, { "epoch": 0.33513513513513515, "grad_norm": 2.2032821621350536, "learning_rate": 2.966748118039402e-05, "loss": 1.917, "step": 93 }, { "epoch": 0.34594594594594597, "grad_norm": 2.434797694487723, "learning_rate": 2.9645768533555387e-05, "loss": 1.9226, "step": 96 }, { "epoch": 0.3567567567567568, "grad_norm": 2.2306447173589947, "learning_rate": 2.9623377693880123e-05, "loss": 1.9273, "step": 99 }, { "epoch": 0.3675675675675676, "grad_norm": 2.4678527947600046, "learning_rate": 2.9600309698207435e-05, "loss": 1.8761, "step": 102 }, { "epoch": 0.3783783783783784, "grad_norm": 2.172069501103823, "learning_rate": 2.957656561473319e-05, "loss": 1.9297, "step": 105 }, { "epoch": 0.3891891891891892, "grad_norm": 2.009177875410658, "learning_rate": 2.955214654296045e-05, "loss": 1.8419, "step": 108 }, { "epoch": 0.4, "grad_norm": 2.8950203407246216, "learning_rate": 2.952705361364855e-05, "loss": 1.9594, "step": 111 }, { "epoch": 0.41081081081081083, "grad_norm": 2.5653163311951106, "learning_rate": 2.950128798876075e-05, "loss": 1.8869, "step": 114 }, { "epoch": 0.42162162162162165, "grad_norm": 2.732827590601958, "learning_rate": 2.947485086141042e-05, "loss": 1.8974, "step": 117 }, { "epoch": 0.43243243243243246, "grad_norm": 2.144285276139079, "learning_rate": 2.9447743455805793e-05, "loss": 1.9196, "step": 120 }, { "epoch": 0.44324324324324327, "grad_norm": 2.4409981534782914, "learning_rate": 2.9419967027193267e-05, "loss": 1.9428, "step": 123 }, { "epoch": 0.4540540540540541, "grad_norm": 2.646738039623452, "learning_rate": 2.9391522861799298e-05, "loss": 1.9737, "step": 126 }, { "epoch": 0.4648648648648649, "grad_norm": 2.3721501035510233, "learning_rate": 2.9362412276770833e-05, "loss": 1.9554, "step": 129 }, { "epoch": 0.4756756756756757, "grad_norm": 2.423435489971938, "learning_rate": 2.93326366201143e-05, "loss": 1.9787, "step": 132 }, { "epoch": 0.4864864864864865, "grad_norm": 2.207859877546887, "learning_rate": 2.9302197270633207e-05, "loss": 1.9259, "step": 135 }, { "epoch": 0.4972972972972973, "grad_norm": 2.4856913311288733, "learning_rate": 2.9271095637864295e-05, "loss": 1.9433, "step": 138 }, { "epoch": 0.5081081081081081, "grad_norm": 2.440879046484796, "learning_rate": 2.9239333162012256e-05, "loss": 1.8939, "step": 141 }, { "epoch": 0.518918918918919, "grad_norm": 2.45798549198192, "learning_rate": 2.9206911313883037e-05, "loss": 1.9845, "step": 144 }, { "epoch": 0.5297297297297298, "grad_norm": 2.3336768971980706, "learning_rate": 2.9173831594815768e-05, "loss": 1.916, "step": 147 }, { "epoch": 0.5405405405405406, "grad_norm": 2.39094175177689, "learning_rate": 2.9140095536613182e-05, "loss": 1.8494, "step": 150 }, { "epoch": 0.5513513513513514, "grad_norm": 2.429522504872439, "learning_rate": 2.9105704701470744e-05, "loss": 1.9189, "step": 153 }, { "epoch": 0.5621621621621622, "grad_norm": 2.7779327685196957, "learning_rate": 2.907066068190426e-05, "loss": 2.027, "step": 156 }, { "epoch": 0.572972972972973, "grad_norm": 2.4829217011923106, "learning_rate": 2.903496510067618e-05, "loss": 1.9414, "step": 159 }, { "epoch": 0.5837837837837838, "grad_norm": 2.307375658362859, "learning_rate": 2.899861961072041e-05, "loss": 1.9, "step": 162 }, { "epoch": 0.5945945945945946, "grad_norm": 2.5317691911892855, "learning_rate": 2.896162589506579e-05, "loss": 1.9359, "step": 165 }, { "epoch": 0.6054054054054054, "grad_norm": 2.4918163002128138, "learning_rate": 2.8923985666758178e-05, "loss": 1.8599, "step": 168 }, { "epoch": 0.6162162162162163, "grad_norm": 2.076258038368402, "learning_rate": 2.888570066878109e-05, "loss": 1.9127, "step": 171 }, { "epoch": 0.6270270270270271, "grad_norm": 2.185928367527268, "learning_rate": 2.884677267397502e-05, "loss": 1.8128, "step": 174 }, { "epoch": 0.6378378378378379, "grad_norm": 2.866856917703555, "learning_rate": 2.88072034849553e-05, "loss": 1.9267, "step": 177 }, { "epoch": 0.6486486486486487, "grad_norm": 2.403239863727176, "learning_rate": 2.8766994934028697e-05, "loss": 1.9034, "step": 180 }, { "epoch": 0.6594594594594595, "grad_norm": 2.156072962852351, "learning_rate": 2.8726148883108505e-05, "loss": 1.9516, "step": 183 }, { "epoch": 0.6702702702702703, "grad_norm": 2.511207967261495, "learning_rate": 2.868466722362836e-05, "loss": 1.8811, "step": 186 }, { "epoch": 0.6810810810810811, "grad_norm": 2.3602190597776467, "learning_rate": 2.8642551876454625e-05, "loss": 1.9503, "step": 189 }, { "epoch": 0.6918918918918919, "grad_norm": 2.482488878028461, "learning_rate": 2.8599804791797483e-05, "loss": 1.8807, "step": 192 }, { "epoch": 0.7027027027027027, "grad_norm": 2.587845075067827, "learning_rate": 2.8556427949120587e-05, "loss": 1.9359, "step": 195 }, { "epoch": 0.7135135135135136, "grad_norm": 2.209476666719861, "learning_rate": 2.851242335704943e-05, "loss": 1.898, "step": 198 }, { "epoch": 0.7243243243243244, "grad_norm": 2.419802615785546, "learning_rate": 2.8467793053278318e-05, "loss": 1.8444, "step": 201 }, { "epoch": 0.7351351351351352, "grad_norm": 2.226045413120279, "learning_rate": 2.842253910447601e-05, "loss": 1.8982, "step": 204 }, { "epoch": 0.745945945945946, "grad_norm": 2.44484302970035, "learning_rate": 2.837666360619002e-05, "loss": 1.9596, "step": 207 }, { "epoch": 0.7567567567567568, "grad_norm": 2.43331348143749, "learning_rate": 2.8330168682749594e-05, "loss": 1.9313, "step": 210 }, { "epoch": 0.7675675675675676, "grad_norm": 2.0567115700480247, "learning_rate": 2.8283056487167313e-05, "loss": 1.9314, "step": 213 }, { "epoch": 0.7783783783783784, "grad_norm": 2.3852531892957223, "learning_rate": 2.8235329201039424e-05, "loss": 1.8631, "step": 216 }, { "epoch": 0.7891891891891892, "grad_norm": 2.136466132414422, "learning_rate": 2.8186989034444794e-05, "loss": 1.859, "step": 219 }, { "epoch": 0.8, "grad_norm": 2.592701276826683, "learning_rate": 2.8138038225842577e-05, "loss": 1.96, "step": 222 }, { "epoch": 0.8108108108108109, "grad_norm": 2.0512819357757, "learning_rate": 2.808847904196857e-05, "loss": 1.8646, "step": 225 }, { "epoch": 0.8216216216216217, "grad_norm": 2.1496899989713767, "learning_rate": 2.8038313777730237e-05, "loss": 1.8924, "step": 228 }, { "epoch": 0.8324324324324325, "grad_norm": 2.150493531375372, "learning_rate": 2.798754475610044e-05, "loss": 1.7991, "step": 231 }, { "epoch": 0.8432432432432433, "grad_norm": 2.094639871928105, "learning_rate": 2.7936174328009864e-05, "loss": 1.9364, "step": 234 }, { "epoch": 0.8540540540540541, "grad_norm": 2.3155422463173623, "learning_rate": 2.7884204872238182e-05, "loss": 1.8647, "step": 237 }, { "epoch": 0.8648648648648649, "grad_norm": 2.2606435766654234, "learning_rate": 2.7831638795303873e-05, "loss": 1.8224, "step": 240 }, { "epoch": 0.8756756756756757, "grad_norm": 2.2579963330786774, "learning_rate": 2.7778478531352795e-05, "loss": 1.8282, "step": 243 }, { "epoch": 0.8864864864864865, "grad_norm": 1.9817080023633638, "learning_rate": 2.7724726542045463e-05, "loss": 1.8818, "step": 246 }, { "epoch": 0.8972972972972973, "grad_norm": 2.24441068506298, "learning_rate": 2.7670385316443084e-05, "loss": 1.9305, "step": 249 }, { "epoch": 0.9081081081081082, "grad_norm": 2.541165407563793, "learning_rate": 2.7615457370892257e-05, "loss": 1.8736, "step": 252 }, { "epoch": 0.918918918918919, "grad_norm": 1.9510813262964137, "learning_rate": 2.7559945248908468e-05, "loss": 1.8999, "step": 255 }, { "epoch": 0.9297297297297298, "grad_norm": 2.4719694412540245, "learning_rate": 2.7503851521058333e-05, "loss": 1.8846, "step": 258 }, { "epoch": 0.9405405405405406, "grad_norm": 2.3098919918683096, "learning_rate": 2.744717878484053e-05, "loss": 1.8999, "step": 261 }, { "epoch": 0.9513513513513514, "grad_norm": 2.465948183249405, "learning_rate": 2.7389929664565523e-05, "loss": 1.8028, "step": 264 }, { "epoch": 0.9621621621621622, "grad_norm": 2.204108562575812, "learning_rate": 2.733210681123406e-05, "loss": 1.9526, "step": 267 }, { "epoch": 0.972972972972973, "grad_norm": 2.3661976037384815, "learning_rate": 2.7273712902414396e-05, "loss": 1.8472, "step": 270 }, { "epoch": 0.9837837837837838, "grad_norm": 2.1596420136763137, "learning_rate": 2.7214750642118315e-05, "loss": 1.849, "step": 273 }, { "epoch": 0.9945945945945946, "grad_norm": 2.1642232408896453, "learning_rate": 2.715522276067591e-05, "loss": 1.8476, "step": 276 }, { "epoch": 1.0054054054054054, "grad_norm": 2.102003078375359, "learning_rate": 2.709513201460915e-05, "loss": 1.6092, "step": 279 }, { "epoch": 1.0162162162162163, "grad_norm": 2.580181780728641, "learning_rate": 2.7034481186504253e-05, "loss": 1.3409, "step": 282 }, { "epoch": 1.027027027027027, "grad_norm": 1.987111465217205, "learning_rate": 2.6973273084882802e-05, "loss": 1.3026, "step": 285 }, { "epoch": 1.037837837837838, "grad_norm": 1.835737689431998, "learning_rate": 2.691151054407172e-05, "loss": 1.2992, "step": 288 }, { "epoch": 1.0486486486486486, "grad_norm": 2.075134750486952, "learning_rate": 2.684919642407202e-05, "loss": 1.2751, "step": 291 }, { "epoch": 1.0594594594594595, "grad_norm": 2.154847674569881, "learning_rate": 2.6786333610426353e-05, "loss": 1.2951, "step": 294 }, { "epoch": 1.0702702702702702, "grad_norm": 2.201259061666919, "learning_rate": 2.67229250140854e-05, "loss": 1.2813, "step": 297 }, { "epoch": 1.0810810810810811, "grad_norm": 2.099199218269766, "learning_rate": 2.6658973571273077e-05, "loss": 1.2422, "step": 300 }, { "epoch": 1.0918918918918918, "grad_norm": 2.1682257578187185, "learning_rate": 2.6594482243350558e-05, "loss": 1.2958, "step": 303 }, { "epoch": 1.1027027027027028, "grad_norm": 1.9450085810744047, "learning_rate": 2.6529454016679175e-05, "loss": 1.2175, "step": 306 }, { "epoch": 1.1135135135135135, "grad_norm": 1.9627021282761483, "learning_rate": 2.6463891902482087e-05, "loss": 1.2143, "step": 309 }, { "epoch": 1.1243243243243244, "grad_norm": 2.0335479254484716, "learning_rate": 2.639779893670487e-05, "loss": 1.2425, "step": 312 }, { "epoch": 1.135135135135135, "grad_norm": 2.004046748177982, "learning_rate": 2.6331178179874934e-05, "loss": 1.2834, "step": 315 }, { "epoch": 1.145945945945946, "grad_norm": 2.1051127397939333, "learning_rate": 2.6264032716959778e-05, "loss": 1.2787, "step": 318 }, { "epoch": 1.1567567567567567, "grad_norm": 1.941259889065534, "learning_rate": 2.6196365657224166e-05, "loss": 1.2456, "step": 321 }, { "epoch": 1.1675675675675676, "grad_norm": 2.4331557433084643, "learning_rate": 2.612818013408613e-05, "loss": 1.2398, "step": 324 }, { "epoch": 1.1783783783783783, "grad_norm": 2.125511278222413, "learning_rate": 2.6059479304971867e-05, "loss": 1.2717, "step": 327 }, { "epoch": 1.1891891891891893, "grad_norm": 2.254173095275699, "learning_rate": 2.5990266351169554e-05, "loss": 1.2694, "step": 330 }, { "epoch": 1.2, "grad_norm": 1.9756822137444563, "learning_rate": 2.5920544477682012e-05, "loss": 1.2747, "step": 333 }, { "epoch": 1.2108108108108109, "grad_norm": 2.1110444161525077, "learning_rate": 2.5850316913078298e-05, "loss": 1.295, "step": 336 }, { "epoch": 1.2216216216216216, "grad_norm": 2.1222020915135995, "learning_rate": 2.5779586909344206e-05, "loss": 1.3109, "step": 339 }, { "epoch": 1.2324324324324325, "grad_norm": 1.8417665825926723, "learning_rate": 2.570835774173169e-05, "loss": 1.3056, "step": 342 }, { "epoch": 1.2432432432432432, "grad_norm": 2.0280022835044016, "learning_rate": 2.563663270860717e-05, "loss": 1.32, "step": 345 }, { "epoch": 1.2540540540540541, "grad_norm": 2.025208781667826, "learning_rate": 2.5564415131298824e-05, "loss": 1.2705, "step": 348 }, { "epoch": 1.2648648648648648, "grad_norm": 1.9120753635362695, "learning_rate": 2.5491708353942773e-05, "loss": 1.2645, "step": 351 }, { "epoch": 1.2756756756756757, "grad_norm": 1.9983727272207963, "learning_rate": 2.5418515743328232e-05, "loss": 1.2795, "step": 354 }, { "epoch": 1.2864864864864864, "grad_norm": 2.0065772123507735, "learning_rate": 2.534484068874162e-05, "loss": 1.3017, "step": 357 }, { "epoch": 1.2972972972972974, "grad_norm": 2.0474786433390335, "learning_rate": 2.5270686601809577e-05, "loss": 1.25, "step": 360 }, { "epoch": 1.308108108108108, "grad_norm": 1.9384897745280192, "learning_rate": 2.5196056916341016e-05, "loss": 1.2294, "step": 363 }, { "epoch": 1.318918918918919, "grad_norm": 1.9918888965378558, "learning_rate": 2.512095508816812e-05, "loss": 1.2941, "step": 366 }, { "epoch": 1.3297297297297297, "grad_norm": 1.9846908820959408, "learning_rate": 2.5045384594986285e-05, "loss": 1.2538, "step": 369 }, { "epoch": 1.3405405405405406, "grad_norm": 2.080792691569016, "learning_rate": 2.4969348936193102e-05, "loss": 1.2543, "step": 372 }, { "epoch": 1.3513513513513513, "grad_norm": 2.0125722435554514, "learning_rate": 2.4892851632726306e-05, "loss": 1.2757, "step": 375 }, { "epoch": 1.3621621621621622, "grad_norm": 1.9764284092403845, "learning_rate": 2.481589622690075e-05, "loss": 1.2625, "step": 378 }, { "epoch": 1.372972972972973, "grad_norm": 2.027142138762262, "learning_rate": 2.4738486282244333e-05, "loss": 1.2831, "step": 381 }, { "epoch": 1.3837837837837839, "grad_norm": 1.9798502988269455, "learning_rate": 2.4660625383333028e-05, "loss": 1.2673, "step": 384 }, { "epoch": 1.3945945945945946, "grad_norm": 2.047403010408032, "learning_rate": 2.4582317135624886e-05, "loss": 1.2698, "step": 387 }, { "epoch": 1.4054054054054055, "grad_norm": 2.012366168035498, "learning_rate": 2.450356516529304e-05, "loss": 1.3192, "step": 390 }, { "epoch": 1.4162162162162162, "grad_norm": 2.05238416440222, "learning_rate": 2.4424373119057852e-05, "loss": 1.2696, "step": 393 }, { "epoch": 1.427027027027027, "grad_norm": 2.2479201479489923, "learning_rate": 2.4344744664018e-05, "loss": 1.3024, "step": 396 }, { "epoch": 1.4378378378378378, "grad_norm": 2.1894458789904583, "learning_rate": 2.4264683487480687e-05, "loss": 1.3099, "step": 399 }, { "epoch": 1.4486486486486487, "grad_norm": 2.037077094665106, "learning_rate": 2.4184193296790887e-05, "loss": 1.2514, "step": 402 }, { "epoch": 1.4594594594594594, "grad_norm": 2.0273525133084087, "learning_rate": 2.410327781915969e-05, "loss": 1.2798, "step": 405 }, { "epoch": 1.4702702702702704, "grad_norm": 2.1689507381607713, "learning_rate": 2.402194080149167e-05, "loss": 1.3066, "step": 408 }, { "epoch": 1.481081081081081, "grad_norm": 1.9503795898614715, "learning_rate": 2.394018601021143e-05, "loss": 1.2582, "step": 411 }, { "epoch": 1.491891891891892, "grad_norm": 2.2761809415998706, "learning_rate": 2.385801723108914e-05, "loss": 1.2981, "step": 414 }, { "epoch": 1.5027027027027027, "grad_norm": 2.0356459303417296, "learning_rate": 2.3775438269065277e-05, "loss": 1.2505, "step": 417 }, { "epoch": 1.5135135135135136, "grad_norm": 1.9297970402246538, "learning_rate": 2.3692452948074395e-05, "loss": 1.2546, "step": 420 }, { "epoch": 1.5243243243243243, "grad_norm": 2.0123397178971714, "learning_rate": 2.360906511086809e-05, "loss": 1.2571, "step": 423 }, { "epoch": 1.535135135135135, "grad_norm": 2.3184894064368033, "learning_rate": 2.352527861883702e-05, "loss": 1.2625, "step": 426 }, { "epoch": 1.545945945945946, "grad_norm": 1.936734476547218, "learning_rate": 2.3441097351832113e-05, "loss": 1.3054, "step": 429 }, { "epoch": 1.5567567567567568, "grad_norm": 2.030353788212261, "learning_rate": 2.3356525207984916e-05, "loss": 1.2755, "step": 432 }, { "epoch": 1.5675675675675675, "grad_norm": 2.039094018045801, "learning_rate": 2.3271566103527063e-05, "loss": 1.2686, "step": 435 }, { "epoch": 1.5783783783783782, "grad_norm": 1.9514327297087253, "learning_rate": 2.318622397260896e-05, "loss": 1.2683, "step": 438 }, { "epoch": 1.5891891891891892, "grad_norm": 1.8736269848724938, "learning_rate": 2.3100502767117566e-05, "loss": 1.2255, "step": 441 }, { "epoch": 1.6, "grad_norm": 1.9037413275565993, "learning_rate": 2.301440645649344e-05, "loss": 1.2669, "step": 444 }, { "epoch": 1.6108108108108108, "grad_norm": 2.172424343159148, "learning_rate": 2.2927939027546895e-05, "loss": 1.2601, "step": 447 }, { "epoch": 1.6216216216216215, "grad_norm": 2.1709165438362152, "learning_rate": 2.284110448427341e-05, "loss": 1.2992, "step": 450 }, { "epoch": 1.6324324324324324, "grad_norm": 1.9517268177516773, "learning_rate": 2.2753906847668197e-05, "loss": 1.2602, "step": 453 }, { "epoch": 1.6432432432432433, "grad_norm": 2.142716687001655, "learning_rate": 2.266635015554002e-05, "loss": 1.2387, "step": 456 }, { "epoch": 1.654054054054054, "grad_norm": 1.892430237031948, "learning_rate": 2.2578438462324214e-05, "loss": 1.2796, "step": 459 }, { "epoch": 1.6648648648648647, "grad_norm": 1.9831414889207626, "learning_rate": 2.2490175838894928e-05, "loss": 1.2693, "step": 462 }, { "epoch": 1.6756756756756757, "grad_norm": 1.968788091532238, "learning_rate": 2.2401566372376635e-05, "loss": 1.2826, "step": 465 }, { "epoch": 1.6864864864864866, "grad_norm": 2.028829960671371, "learning_rate": 2.231261416595486e-05, "loss": 1.2412, "step": 468 }, { "epoch": 1.6972972972972973, "grad_norm": 2.0761836559826126, "learning_rate": 2.222332333868618e-05, "loss": 1.2907, "step": 471 }, { "epoch": 1.708108108108108, "grad_norm": 2.0505303769583714, "learning_rate": 2.2133698025307487e-05, "loss": 1.2164, "step": 474 }, { "epoch": 1.718918918918919, "grad_norm": 1.9892599771865245, "learning_rate": 2.2043742376044507e-05, "loss": 1.3029, "step": 477 }, { "epoch": 1.7297297297297298, "grad_norm": 2.056680185472525, "learning_rate": 2.195346055641966e-05, "loss": 1.2532, "step": 480 }, { "epoch": 1.7405405405405405, "grad_norm": 2.020031485543879, "learning_rate": 2.186285674705911e-05, "loss": 1.2752, "step": 483 }, { "epoch": 1.7513513513513512, "grad_norm": 1.976348646730665, "learning_rate": 2.1771935143499233e-05, "loss": 1.281, "step": 486 }, { "epoch": 1.7621621621621621, "grad_norm": 1.9671362390239506, "learning_rate": 2.1680699955992295e-05, "loss": 1.2567, "step": 489 }, { "epoch": 1.772972972972973, "grad_norm": 1.9862314788875317, "learning_rate": 2.1589155409311514e-05, "loss": 1.2722, "step": 492 }, { "epoch": 1.7837837837837838, "grad_norm": 1.8794017228975768, "learning_rate": 2.1497305742555416e-05, "loss": 1.2267, "step": 495 }, { "epoch": 1.7945945945945945, "grad_norm": 1.8537400724418822, "learning_rate": 2.140515520895154e-05, "loss": 1.2856, "step": 498 }, { "epoch": 1.8054054054054054, "grad_norm": 2.2140372302917424, "learning_rate": 2.131270807565948e-05, "loss": 1.2668, "step": 501 }, { "epoch": 1.8162162162162163, "grad_norm": 2.1224010879465514, "learning_rate": 2.1219968623573292e-05, "loss": 1.3403, "step": 504 }, { "epoch": 1.827027027027027, "grad_norm": 2.150717671959958, "learning_rate": 2.1126941147123285e-05, "loss": 1.3294, "step": 507 }, { "epoch": 1.8378378378378377, "grad_norm": 1.9412785818269171, "learning_rate": 2.1033629954077123e-05, "loss": 1.298, "step": 510 }, { "epoch": 1.8486486486486486, "grad_norm": 1.9061258724957593, "learning_rate": 2.0940039365340363e-05, "loss": 1.2984, "step": 513 }, { "epoch": 1.8594594594594596, "grad_norm": 1.91000429693783, "learning_rate": 2.0846173714756372e-05, "loss": 1.2541, "step": 516 }, { "epoch": 1.8702702702702703, "grad_norm": 2.0253367045491957, "learning_rate": 2.0752037348905656e-05, "loss": 1.3045, "step": 519 }, { "epoch": 1.881081081081081, "grad_norm": 2.099866673823967, "learning_rate": 2.0657634626904544e-05, "loss": 1.2841, "step": 522 }, { "epoch": 1.8918918918918919, "grad_norm": 2.053590437534557, "learning_rate": 2.056296992020339e-05, "loss": 1.2732, "step": 525 }, { "epoch": 1.9027027027027028, "grad_norm": 2.037664067587095, "learning_rate": 2.046804761238409e-05, "loss": 1.2661, "step": 528 }, { "epoch": 1.9135135135135135, "grad_norm": 2.1032576901560875, "learning_rate": 2.037287209895713e-05, "loss": 1.2815, "step": 531 }, { "epoch": 1.9243243243243242, "grad_norm": 2.0365409183820393, "learning_rate": 2.0277447787158057e-05, "loss": 1.281, "step": 534 }, { "epoch": 1.9351351351351351, "grad_norm": 1.9303661726886707, "learning_rate": 2.0181779095743335e-05, "loss": 1.3122, "step": 537 }, { "epoch": 1.945945945945946, "grad_norm": 2.0989107332254164, "learning_rate": 2.008587045478581e-05, "loss": 1.2766, "step": 540 }, { "epoch": 1.9567567567567568, "grad_norm": 2.2610013717867607, "learning_rate": 1.9989726305469497e-05, "loss": 1.2744, "step": 543 }, { "epoch": 1.9675675675675675, "grad_norm": 1.9759438397315554, "learning_rate": 1.989335109988397e-05, "loss": 1.2821, "step": 546 }, { "epoch": 1.9783783783783784, "grad_norm": 2.0630705322166185, "learning_rate": 1.9796749300818185e-05, "loss": 1.2964, "step": 549 }, { "epoch": 1.9891891891891893, "grad_norm": 2.062855518861247, "learning_rate": 1.9699925381553824e-05, "loss": 1.3101, "step": 552 }, { "epoch": 2.0, "grad_norm": 2.004529311768417, "learning_rate": 1.960288382565816e-05, "loss": 1.2436, "step": 555 }, { "epoch": 2.0108108108108107, "grad_norm": 1.8731442249653363, "learning_rate": 1.9505629126776435e-05, "loss": 0.7428, "step": 558 }, { "epoch": 2.0216216216216214, "grad_norm": 2.7743566111060356, "learning_rate": 1.9408165788423776e-05, "loss": 0.6521, "step": 561 }, { "epoch": 2.0324324324324325, "grad_norm": 2.4263480281287326, "learning_rate": 1.9310498323776642e-05, "loss": 0.6719, "step": 564 }, { "epoch": 2.0432432432432432, "grad_norm": 1.7971900573296662, "learning_rate": 1.9212631255463864e-05, "loss": 0.6507, "step": 567 }, { "epoch": 2.054054054054054, "grad_norm": 1.9536502652307774, "learning_rate": 1.911456911535719e-05, "loss": 0.6713, "step": 570 }, { "epoch": 2.064864864864865, "grad_norm": 1.6734389443783486, "learning_rate": 1.9016316444361443e-05, "loss": 0.6513, "step": 573 }, { "epoch": 2.075675675675676, "grad_norm": 2.0889794055853446, "learning_rate": 1.8917877792204238e-05, "loss": 0.6391, "step": 576 }, { "epoch": 2.0864864864864865, "grad_norm": 1.952453907013592, "learning_rate": 1.881925771722533e-05, "loss": 0.6278, "step": 579 }, { "epoch": 2.097297297297297, "grad_norm": 1.9394182987242035, "learning_rate": 1.872046078616549e-05, "loss": 0.6268, "step": 582 }, { "epoch": 2.108108108108108, "grad_norm": 1.7760412655871423, "learning_rate": 1.862149157395506e-05, "loss": 0.6217, "step": 585 }, { "epoch": 2.118918918918919, "grad_norm": 1.697555380465673, "learning_rate": 1.852235466350212e-05, "loss": 0.6496, "step": 588 }, { "epoch": 2.1297297297297297, "grad_norm": 1.9579376565670537, "learning_rate": 1.8423054645480228e-05, "loss": 0.6388, "step": 591 }, { "epoch": 2.1405405405405404, "grad_norm": 2.1761144428281565, "learning_rate": 1.8323596118115882e-05, "loss": 0.6293, "step": 594 }, { "epoch": 2.1513513513513516, "grad_norm": 1.7805116699629946, "learning_rate": 1.8223983686975576e-05, "loss": 0.6321, "step": 597 }, { "epoch": 2.1621621621621623, "grad_norm": 2.0913504208478773, "learning_rate": 1.8124221964752535e-05, "loss": 0.6312, "step": 600 }, { "epoch": 2.172972972972973, "grad_norm": 1.7500638235123256, "learning_rate": 1.80243155710531e-05, "loss": 0.6217, "step": 603 }, { "epoch": 2.1837837837837837, "grad_norm": 1.8271358171780696, "learning_rate": 1.7924269132182855e-05, "loss": 0.6711, "step": 606 }, { "epoch": 2.1945945945945944, "grad_norm": 1.9546793256337727, "learning_rate": 1.782408728093235e-05, "loss": 0.6392, "step": 609 }, { "epoch": 2.2054054054054055, "grad_norm": 1.7882856434212824, "learning_rate": 1.7723774656362602e-05, "loss": 0.6395, "step": 612 }, { "epoch": 2.2162162162162162, "grad_norm": 1.7898241991442447, "learning_rate": 1.762333590359028e-05, "loss": 0.6521, "step": 615 }, { "epoch": 2.227027027027027, "grad_norm": 1.6654897277159277, "learning_rate": 1.752277567357258e-05, "loss": 0.646, "step": 618 }, { "epoch": 2.237837837837838, "grad_norm": 1.8175887991140565, "learning_rate": 1.7422098622891873e-05, "loss": 0.613, "step": 621 }, { "epoch": 2.2486486486486488, "grad_norm": 2.208881509880086, "learning_rate": 1.7321309413540087e-05, "loss": 0.6375, "step": 624 }, { "epoch": 2.2594594594594595, "grad_norm": 1.8520498287505407, "learning_rate": 1.722041271270281e-05, "loss": 0.6613, "step": 627 }, { "epoch": 2.27027027027027, "grad_norm": 2.0512133133202988, "learning_rate": 1.7119413192543165e-05, "loss": 0.6292, "step": 630 }, { "epoch": 2.281081081081081, "grad_norm": 1.773677981324912, "learning_rate": 1.701831552998548e-05, "loss": 0.6399, "step": 633 }, { "epoch": 2.291891891891892, "grad_norm": 1.8469844530131443, "learning_rate": 1.6917124406498697e-05, "loss": 0.6622, "step": 636 }, { "epoch": 2.3027027027027027, "grad_norm": 2.0663988043195265, "learning_rate": 1.68158445078796e-05, "loss": 0.6428, "step": 639 }, { "epoch": 2.3135135135135134, "grad_norm": 1.7259116146942584, "learning_rate": 1.671448052403583e-05, "loss": 0.6528, "step": 642 }, { "epoch": 2.3243243243243246, "grad_norm": 1.9046842309367298, "learning_rate": 1.6613037148768702e-05, "loss": 0.6619, "step": 645 }, { "epoch": 2.3351351351351353, "grad_norm": 1.8564656298207483, "learning_rate": 1.6511519079555887e-05, "loss": 0.6665, "step": 648 }, { "epoch": 2.345945945945946, "grad_norm": 1.8614483941828683, "learning_rate": 1.640993101733383e-05, "loss": 0.6494, "step": 651 }, { "epoch": 2.3567567567567567, "grad_norm": 1.8411704889596792, "learning_rate": 1.6308277666280133e-05, "loss": 0.6286, "step": 654 }, { "epoch": 2.3675675675675674, "grad_norm": 1.810238547613111, "learning_rate": 1.6206563733595666e-05, "loss": 0.6544, "step": 657 }, { "epoch": 2.3783783783783785, "grad_norm": 1.803680767305892, "learning_rate": 1.610479392928663e-05, "loss": 0.6449, "step": 660 }, { "epoch": 2.389189189189189, "grad_norm": 1.7701921715064215, "learning_rate": 1.600297296594643e-05, "loss": 0.6604, "step": 663 }, { "epoch": 2.4, "grad_norm": 1.6556840871349028, "learning_rate": 1.5901105558537472e-05, "loss": 0.6775, "step": 666 }, { "epoch": 2.410810810810811, "grad_norm": 1.734110836750224, "learning_rate": 1.579919642417281e-05, "loss": 0.6482, "step": 669 }, { "epoch": 2.4216216216216218, "grad_norm": 2.000655482106033, "learning_rate": 1.569725028189772e-05, "loss": 0.648, "step": 672 }, { "epoch": 2.4324324324324325, "grad_norm": 1.7669642313010707, "learning_rate": 1.5595271852471204e-05, "loss": 0.6548, "step": 675 }, { "epoch": 2.443243243243243, "grad_norm": 1.7114555298061667, "learning_rate": 1.5493265858147335e-05, "loss": 0.6291, "step": 678 }, { "epoch": 2.454054054054054, "grad_norm": 1.982145197600375, "learning_rate": 1.5391237022456636e-05, "loss": 0.6648, "step": 681 }, { "epoch": 2.464864864864865, "grad_norm": 2.057868578472174, "learning_rate": 1.5289190069987332e-05, "loss": 0.652, "step": 684 }, { "epoch": 2.4756756756756757, "grad_norm": 1.6966492734095149, "learning_rate": 1.5187129726166565e-05, "loss": 0.6524, "step": 687 }, { "epoch": 2.4864864864864864, "grad_norm": 1.7126379411947121, "learning_rate": 1.5085060717041585e-05, "loss": 0.6691, "step": 690 }, { "epoch": 2.4972972972972975, "grad_norm": 2.027436021235288, "learning_rate": 1.4982987769060898e-05, "loss": 0.6551, "step": 693 }, { "epoch": 2.5081081081081082, "grad_norm": 1.856345058930321, "learning_rate": 1.4880915608855402e-05, "loss": 0.6596, "step": 696 }, { "epoch": 2.518918918918919, "grad_norm": 1.8317958549352111, "learning_rate": 1.477884896301953e-05, "loss": 0.6283, "step": 699 }, { "epoch": 2.5297297297297296, "grad_norm": 1.7233177960794333, "learning_rate": 1.467679255789234e-05, "loss": 0.659, "step": 702 }, { "epoch": 2.5405405405405403, "grad_norm": 1.70587685226235, "learning_rate": 1.4574751119338703e-05, "loss": 0.6375, "step": 705 }, { "epoch": 2.5513513513513515, "grad_norm": 1.8655585555708385, "learning_rate": 1.4472729372530432e-05, "loss": 0.6242, "step": 708 }, { "epoch": 2.562162162162162, "grad_norm": 1.8001992070479, "learning_rate": 1.4370732041727495e-05, "loss": 0.643, "step": 711 }, { "epoch": 2.572972972972973, "grad_norm": 1.848601528720487, "learning_rate": 1.426876385005922e-05, "loss": 0.653, "step": 714 }, { "epoch": 2.583783783783784, "grad_norm": 1.7058353790178475, "learning_rate": 1.4166829519305628e-05, "loss": 0.6189, "step": 717 }, { "epoch": 2.5945945945945947, "grad_norm": 1.7284279167710883, "learning_rate": 1.406493376967876e-05, "loss": 0.6447, "step": 720 }, { "epoch": 2.6054054054054054, "grad_norm": 1.9360862264128509, "learning_rate": 1.396308131960409e-05, "loss": 0.6396, "step": 723 }, { "epoch": 2.616216216216216, "grad_norm": 1.6785593995242742, "learning_rate": 1.386127688550206e-05, "loss": 0.6305, "step": 726 }, { "epoch": 2.627027027027027, "grad_norm": 1.7901370985866867, "learning_rate": 1.3759525181569663e-05, "loss": 0.6379, "step": 729 }, { "epoch": 2.637837837837838, "grad_norm": 1.7497083646634908, "learning_rate": 1.3657830919562151e-05, "loss": 0.6252, "step": 732 }, { "epoch": 2.6486486486486487, "grad_norm": 1.8186530224800674, "learning_rate": 1.3556198808574828e-05, "loss": 0.6751, "step": 735 }, { "epoch": 2.6594594594594594, "grad_norm": 1.7289128215137377, "learning_rate": 1.3454633554825029e-05, "loss": 0.6467, "step": 738 }, { "epoch": 2.6702702702702705, "grad_norm": 1.7639526554500256, "learning_rate": 1.335313986143416e-05, "loss": 0.6166, "step": 741 }, { "epoch": 2.6810810810810812, "grad_norm": 1.7186641679091503, "learning_rate": 1.3251722428209933e-05, "loss": 0.6845, "step": 744 }, { "epoch": 2.691891891891892, "grad_norm": 1.6965540327042066, "learning_rate": 1.3150385951428714e-05, "loss": 0.6487, "step": 747 }, { "epoch": 2.7027027027027026, "grad_norm": 1.8567952974204969, "learning_rate": 1.3049135123618073e-05, "loss": 0.6457, "step": 750 }, { "epoch": 2.7135135135135133, "grad_norm": 1.6295081249921362, "learning_rate": 1.2947974633339499e-05, "loss": 0.6445, "step": 753 }, { "epoch": 2.7243243243243245, "grad_norm": 1.7231568913626358, "learning_rate": 1.2846909164971244e-05, "loss": 0.6434, "step": 756 }, { "epoch": 2.735135135135135, "grad_norm": 1.8455193518645283, "learning_rate": 1.2745943398491462e-05, "loss": 0.65, "step": 759 }, { "epoch": 2.745945945945946, "grad_norm": 1.827106966674069, "learning_rate": 1.2645082009261468e-05, "loss": 0.6628, "step": 762 }, { "epoch": 2.756756756756757, "grad_norm": 1.7633688689382077, "learning_rate": 1.254432966780924e-05, "loss": 0.6491, "step": 765 }, { "epoch": 2.7675675675675677, "grad_norm": 1.7284101689900349, "learning_rate": 1.2443691039613128e-05, "loss": 0.6258, "step": 768 }, { "epoch": 2.7783783783783784, "grad_norm": 1.87274147264611, "learning_rate": 1.2343170784885859e-05, "loss": 0.6476, "step": 771 }, { "epoch": 2.789189189189189, "grad_norm": 1.6928834129022787, "learning_rate": 1.2242773558358701e-05, "loss": 0.638, "step": 774 }, { "epoch": 2.8, "grad_norm": 1.7838050014292766, "learning_rate": 1.2142504009065914e-05, "loss": 0.6402, "step": 777 }, { "epoch": 2.810810810810811, "grad_norm": 1.7473000738048472, "learning_rate": 1.2042366780129507e-05, "loss": 0.615, "step": 780 }, { "epoch": 2.8216216216216217, "grad_norm": 1.8638020286903605, "learning_rate": 1.1942366508544195e-05, "loss": 0.6425, "step": 783 }, { "epoch": 2.8324324324324324, "grad_norm": 1.874577580434278, "learning_rate": 1.1842507824962694e-05, "loss": 0.6504, "step": 786 }, { "epoch": 2.8432432432432435, "grad_norm": 1.755404683062205, "learning_rate": 1.1742795353481291e-05, "loss": 0.6541, "step": 789 }, { "epoch": 2.854054054054054, "grad_norm": 1.8072551565675472, "learning_rate": 1.1643233711425716e-05, "loss": 0.6683, "step": 792 }, { "epoch": 2.864864864864865, "grad_norm": 1.6667276846485022, "learning_rate": 1.1543827509137329e-05, "loss": 0.6486, "step": 795 }, { "epoch": 2.8756756756756756, "grad_norm": 1.688942735670296, "learning_rate": 1.144458134975964e-05, "loss": 0.6652, "step": 798 }, { "epoch": 2.8864864864864863, "grad_norm": 1.753840610632178, "learning_rate": 1.1345499829025136e-05, "loss": 0.6634, "step": 801 }, { "epoch": 2.8972972972972975, "grad_norm": 1.9607814129942556, "learning_rate": 1.1246587535042492e-05, "loss": 0.6426, "step": 804 }, { "epoch": 2.908108108108108, "grad_norm": 1.813335779829059, "learning_rate": 1.1147849048084105e-05, "loss": 0.6315, "step": 807 }, { "epoch": 2.918918918918919, "grad_norm": 1.7402704693393687, "learning_rate": 1.1049288940373972e-05, "loss": 0.6228, "step": 810 }, { "epoch": 2.92972972972973, "grad_norm": 1.7680419333729458, "learning_rate": 1.0950911775876014e-05, "loss": 0.6, "step": 813 }, { "epoch": 2.9405405405405407, "grad_norm": 1.843223644664368, "learning_rate": 1.0852722110082693e-05, "loss": 0.6476, "step": 816 }, { "epoch": 2.9513513513513514, "grad_norm": 1.7069834663839671, "learning_rate": 1.0754724489804098e-05, "loss": 0.6593, "step": 819 }, { "epoch": 2.962162162162162, "grad_norm": 1.734691145206824, "learning_rate": 1.0656923452957354e-05, "loss": 0.6252, "step": 822 }, { "epoch": 2.972972972972973, "grad_norm": 1.6530132045637638, "learning_rate": 1.0559323528356542e-05, "loss": 0.6218, "step": 825 }, { "epoch": 2.983783783783784, "grad_norm": 1.7766789557037792, "learning_rate": 1.0461929235502952e-05, "loss": 0.6494, "step": 828 }, { "epoch": 2.9945945945945946, "grad_norm": 1.777411804411515, "learning_rate": 1.036474508437579e-05, "loss": 0.6409, "step": 831 } ], "logging_steps": 3, "max_steps": 1385, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 833, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 151014030311424.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }