{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3665823527255398, "eval_steps": 1000, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003665823527255398, "grad_norm": 0.8778485808644284, "learning_rate": 2e-07, "loss": 2.1465, "step": 10 }, { "epoch": 0.0007331647054510796, "grad_norm": 1.0370696683685088, "learning_rate": 4e-07, "loss": 2.1972, "step": 20 }, { "epoch": 0.0010997470581766194, "grad_norm": 1.0006676078231553, "learning_rate": 6e-07, "loss": 2.1582, "step": 30 }, { "epoch": 0.0014663294109021592, "grad_norm": 0.8854477289760336, "learning_rate": 8e-07, "loss": 2.1934, "step": 40 }, { "epoch": 0.001832911763627699, "grad_norm": 0.8999727006888211, "learning_rate": 1e-06, "loss": 2.1904, "step": 50 }, { "epoch": 0.002199494116353239, "grad_norm": 0.932364223152173, "learning_rate": 9.999996672053607e-07, "loss": 2.1706, "step": 60 }, { "epoch": 0.0025660764690787785, "grad_norm": 1.0299012086021375, "learning_rate": 9.999986688218858e-07, "loss": 2.1958, "step": 70 }, { "epoch": 0.0029326588218043185, "grad_norm": 0.9395158606106717, "learning_rate": 9.999970048509042e-07, "loss": 2.2273, "step": 80 }, { "epoch": 0.003299241174529858, "grad_norm": 0.9869960358591985, "learning_rate": 9.999946752946311e-07, "loss": 2.1807, "step": 90 }, { "epoch": 0.003665823527255398, "grad_norm": 0.98825421384792, "learning_rate": 9.999916801561675e-07, "loss": 2.1348, "step": 100 }, { "epoch": 0.004032405879980938, "grad_norm": 1.1988395000442367, "learning_rate": 9.999880194395004e-07, "loss": 2.1377, "step": 110 }, { "epoch": 0.004398988232706478, "grad_norm": 1.129064025809237, "learning_rate": 9.99983693149503e-07, "loss": 2.1565, "step": 120 }, { "epoch": 0.004765570585432017, "grad_norm": 1.0050118479797396, "learning_rate": 9.999787012919342e-07, "loss": 2.1701, "step": 130 }, { "epoch": 0.005132152938157557, "grad_norm": 0.9232759625522824, "learning_rate": 9.999730438734393e-07, "loss": 2.0963, "step": 140 }, { "epoch": 0.0054987352908830965, "grad_norm": 1.0348403490845175, "learning_rate": 9.999667209015492e-07, "loss": 2.1989, "step": 150 }, { "epoch": 0.005865317643608637, "grad_norm": 1.0493408122676058, "learning_rate": 9.999597323846806e-07, "loss": 2.1707, "step": 160 }, { "epoch": 0.0062318999963341766, "grad_norm": 1.116513730433909, "learning_rate": 9.99952078332137e-07, "loss": 2.1614, "step": 170 }, { "epoch": 0.006598482349059716, "grad_norm": 0.9558367370618089, "learning_rate": 9.999437587541072e-07, "loss": 2.1214, "step": 180 }, { "epoch": 0.006965064701785256, "grad_norm": 1.0990453159310916, "learning_rate": 9.999347736616657e-07, "loss": 2.1514, "step": 190 }, { "epoch": 0.007331647054510796, "grad_norm": 1.051146838955259, "learning_rate": 9.999251230667734e-07, "loss": 2.1672, "step": 200 }, { "epoch": 0.007698229407236336, "grad_norm": 1.0528334484392676, "learning_rate": 9.99914806982277e-07, "loss": 2.1651, "step": 210 }, { "epoch": 0.008064811759961876, "grad_norm": 1.0488001209067876, "learning_rate": 9.999038254219094e-07, "loss": 2.1269, "step": 220 }, { "epoch": 0.008431394112687415, "grad_norm": 1.0423933094923075, "learning_rate": 9.998921784002884e-07, "loss": 2.1409, "step": 230 }, { "epoch": 0.008797976465412955, "grad_norm": 1.2035163212207243, "learning_rate": 9.998798659329188e-07, "loss": 2.0949, "step": 240 }, { "epoch": 0.009164558818138494, "grad_norm": 1.0311622443925152, "learning_rate": 9.998668880361902e-07, "loss": 2.1572, "step": 250 }, { "epoch": 0.009531141170864035, "grad_norm": 1.0199238986570556, "learning_rate": 9.99853244727379e-07, "loss": 2.0908, "step": 260 }, { "epoch": 0.009897723523589575, "grad_norm": 1.1052910194491554, "learning_rate": 9.998389360246465e-07, "loss": 2.1046, "step": 270 }, { "epoch": 0.010264305876315114, "grad_norm": 1.0244380828171549, "learning_rate": 9.998239619470404e-07, "loss": 2.1351, "step": 280 }, { "epoch": 0.010630888229040654, "grad_norm": 1.0080176905815665, "learning_rate": 9.998083225144936e-07, "loss": 2.089, "step": 290 }, { "epoch": 0.010997470581766193, "grad_norm": 0.9588881775099163, "learning_rate": 9.997920177478252e-07, "loss": 2.0186, "step": 300 }, { "epoch": 0.011364052934491733, "grad_norm": 1.0223619251237732, "learning_rate": 9.997750476687394e-07, "loss": 2.0966, "step": 310 }, { "epoch": 0.011730635287217274, "grad_norm": 1.1940399230837102, "learning_rate": 9.99757412299827e-07, "loss": 2.1036, "step": 320 }, { "epoch": 0.012097217639942813, "grad_norm": 0.9943487033980454, "learning_rate": 9.997391116645635e-07, "loss": 2.0628, "step": 330 }, { "epoch": 0.012463799992668353, "grad_norm": 1.03891573450971, "learning_rate": 9.997201457873104e-07, "loss": 2.0691, "step": 340 }, { "epoch": 0.012830382345393894, "grad_norm": 1.116344520158988, "learning_rate": 9.997005146933144e-07, "loss": 2.0524, "step": 350 }, { "epoch": 0.013196964698119432, "grad_norm": 0.9966017657422209, "learning_rate": 9.996802184087082e-07, "loss": 2.0779, "step": 360 }, { "epoch": 0.013563547050844973, "grad_norm": 1.0412743923430994, "learning_rate": 9.996592569605099e-07, "loss": 2.0376, "step": 370 }, { "epoch": 0.013930129403570512, "grad_norm": 1.1118998023014073, "learning_rate": 9.996376303766227e-07, "loss": 2.015, "step": 380 }, { "epoch": 0.014296711756296052, "grad_norm": 1.0325566872435106, "learning_rate": 9.996153386858355e-07, "loss": 2.0249, "step": 390 }, { "epoch": 0.014663294109021592, "grad_norm": 0.9345504257678122, "learning_rate": 9.995923819178226e-07, "loss": 2.0451, "step": 400 }, { "epoch": 0.015029876461747131, "grad_norm": 0.8875269101106378, "learning_rate": 9.995687601031435e-07, "loss": 2.0108, "step": 410 }, { "epoch": 0.015396458814472672, "grad_norm": 1.0784341870798066, "learning_rate": 9.99544473273243e-07, "loss": 2.0201, "step": 420 }, { "epoch": 0.015763041167198212, "grad_norm": 0.9379135038421763, "learning_rate": 9.995195214604515e-07, "loss": 1.941, "step": 430 }, { "epoch": 0.016129623519923753, "grad_norm": 0.9126909079244707, "learning_rate": 9.994939046979838e-07, "loss": 1.9684, "step": 440 }, { "epoch": 0.01649620587264929, "grad_norm": 0.8838022442791796, "learning_rate": 9.994676230199407e-07, "loss": 2.0389, "step": 450 }, { "epoch": 0.01686278822537483, "grad_norm": 0.8836839199930503, "learning_rate": 9.994406764613082e-07, "loss": 1.9666, "step": 460 }, { "epoch": 0.01722937057810037, "grad_norm": 1.0627568898996331, "learning_rate": 9.994130650579563e-07, "loss": 2.0156, "step": 470 }, { "epoch": 0.01759595293082591, "grad_norm": 0.9141641845780258, "learning_rate": 9.993847888466408e-07, "loss": 1.9649, "step": 480 }, { "epoch": 0.01796253528355145, "grad_norm": 0.9929808622960486, "learning_rate": 9.993558478650027e-07, "loss": 1.951, "step": 490 }, { "epoch": 0.01832911763627699, "grad_norm": 0.9649106649125109, "learning_rate": 9.993262421515677e-07, "loss": 2.0194, "step": 500 }, { "epoch": 0.01869569998900253, "grad_norm": 0.9646184299435382, "learning_rate": 9.992959717457456e-07, "loss": 2.0054, "step": 510 }, { "epoch": 0.01906228234172807, "grad_norm": 0.9754107205971403, "learning_rate": 9.992650366878326e-07, "loss": 1.9614, "step": 520 }, { "epoch": 0.01942886469445361, "grad_norm": 0.825876663123403, "learning_rate": 9.99233437019008e-07, "loss": 2.0141, "step": 530 }, { "epoch": 0.01979544704717915, "grad_norm": 0.9898145517539251, "learning_rate": 9.992011727813372e-07, "loss": 1.9788, "step": 540 }, { "epoch": 0.020162029399904687, "grad_norm": 1.1244188599069105, "learning_rate": 9.991682440177694e-07, "loss": 1.9034, "step": 550 }, { "epoch": 0.020528611752630228, "grad_norm": 1.1497344942569774, "learning_rate": 9.991346507721387e-07, "loss": 1.9211, "step": 560 }, { "epoch": 0.020895194105355768, "grad_norm": 0.9021316458842555, "learning_rate": 9.991003930891637e-07, "loss": 1.9182, "step": 570 }, { "epoch": 0.02126177645808131, "grad_norm": 0.8307709564470201, "learning_rate": 9.990654710144475e-07, "loss": 1.9272, "step": 580 }, { "epoch": 0.02162835881080685, "grad_norm": 0.8745951617052735, "learning_rate": 9.990298845944777e-07, "loss": 1.9499, "step": 590 }, { "epoch": 0.021994941163532386, "grad_norm": 0.8243921045085457, "learning_rate": 9.98993633876626e-07, "loss": 1.9221, "step": 600 }, { "epoch": 0.022361523516257927, "grad_norm": 0.9285168979863858, "learning_rate": 9.989567189091486e-07, "loss": 1.8804, "step": 610 }, { "epoch": 0.022728105868983467, "grad_norm": 0.9675998606348684, "learning_rate": 9.98919139741186e-07, "loss": 1.9019, "step": 620 }, { "epoch": 0.023094688221709007, "grad_norm": 0.8852104273861887, "learning_rate": 9.988808964227629e-07, "loss": 1.8772, "step": 630 }, { "epoch": 0.023461270574434548, "grad_norm": 0.819719680853091, "learning_rate": 9.988419890047877e-07, "loss": 1.9171, "step": 640 }, { "epoch": 0.023827852927160085, "grad_norm": 0.93140794342887, "learning_rate": 9.988024175390533e-07, "loss": 1.8467, "step": 650 }, { "epoch": 0.024194435279885625, "grad_norm": 0.8360802933834758, "learning_rate": 9.987621820782363e-07, "loss": 1.9233, "step": 660 }, { "epoch": 0.024561017632611166, "grad_norm": 0.8157180427592693, "learning_rate": 9.987212826758975e-07, "loss": 1.9473, "step": 670 }, { "epoch": 0.024927599985336706, "grad_norm": 0.9793002573948607, "learning_rate": 9.98679719386481e-07, "loss": 1.8931, "step": 680 }, { "epoch": 0.025294182338062247, "grad_norm": 0.8445420197840301, "learning_rate": 9.986374922653154e-07, "loss": 1.8686, "step": 690 }, { "epoch": 0.025660764690787787, "grad_norm": 0.8584605142905422, "learning_rate": 9.985946013686119e-07, "loss": 1.8967, "step": 700 }, { "epoch": 0.026027347043513324, "grad_norm": 0.98656156834715, "learning_rate": 9.985510467534664e-07, "loss": 1.8635, "step": 710 }, { "epoch": 0.026393929396238865, "grad_norm": 0.9182458113746159, "learning_rate": 9.985068284778577e-07, "loss": 1.8693, "step": 720 }, { "epoch": 0.026760511748964405, "grad_norm": 0.8330989668660308, "learning_rate": 9.984619466006485e-07, "loss": 1.8613, "step": 730 }, { "epoch": 0.027127094101689946, "grad_norm": 0.8644736624360776, "learning_rate": 9.98416401181584e-07, "loss": 1.8628, "step": 740 }, { "epoch": 0.027493676454415486, "grad_norm": 0.987168924150431, "learning_rate": 9.98370192281294e-07, "loss": 1.8943, "step": 750 }, { "epoch": 0.027860258807141023, "grad_norm": 0.8720418625775509, "learning_rate": 9.983233199612903e-07, "loss": 1.9446, "step": 760 }, { "epoch": 0.028226841159866563, "grad_norm": 0.7953663245922279, "learning_rate": 9.982757842839687e-07, "loss": 1.9014, "step": 770 }, { "epoch": 0.028593423512592104, "grad_norm": 0.9296681817326182, "learning_rate": 9.98227585312607e-07, "loss": 1.8108, "step": 780 }, { "epoch": 0.028960005865317644, "grad_norm": 0.8062000633701384, "learning_rate": 9.981787231113675e-07, "loss": 1.8345, "step": 790 }, { "epoch": 0.029326588218043185, "grad_norm": 0.7938194156111642, "learning_rate": 9.981291977452939e-07, "loss": 1.8941, "step": 800 }, { "epoch": 0.029693170570768722, "grad_norm": 0.9291321405470028, "learning_rate": 9.980790092803135e-07, "loss": 1.8403, "step": 810 }, { "epoch": 0.030059752923494262, "grad_norm": 0.8275423223500764, "learning_rate": 9.980281577832363e-07, "loss": 1.8402, "step": 820 }, { "epoch": 0.030426335276219803, "grad_norm": 0.8980283349268403, "learning_rate": 9.979766433217545e-07, "loss": 1.8691, "step": 830 }, { "epoch": 0.030792917628945343, "grad_norm": 0.7768796883189981, "learning_rate": 9.979244659644429e-07, "loss": 1.888, "step": 840 }, { "epoch": 0.031159499981670884, "grad_norm": 0.818398169635764, "learning_rate": 9.978716257807593e-07, "loss": 1.8814, "step": 850 }, { "epoch": 0.031526082334396424, "grad_norm": 0.8442121417280394, "learning_rate": 9.97818122841043e-07, "loss": 1.8369, "step": 860 }, { "epoch": 0.031892664687121965, "grad_norm": 0.8176757534156489, "learning_rate": 9.977639572165162e-07, "loss": 1.8591, "step": 870 }, { "epoch": 0.032259247039847505, "grad_norm": 0.8029579269470367, "learning_rate": 9.97709128979283e-07, "loss": 1.8866, "step": 880 }, { "epoch": 0.03262582939257304, "grad_norm": 0.8812915944662771, "learning_rate": 9.976536382023294e-07, "loss": 1.8366, "step": 890 }, { "epoch": 0.03299241174529858, "grad_norm": 0.777876054228082, "learning_rate": 9.97597484959524e-07, "loss": 1.8322, "step": 900 }, { "epoch": 0.03335899409802412, "grad_norm": 0.9073927568433396, "learning_rate": 9.975406693256162e-07, "loss": 1.8238, "step": 910 }, { "epoch": 0.03372557645074966, "grad_norm": 1.154230547383887, "learning_rate": 9.974831913762382e-07, "loss": 1.8574, "step": 920 }, { "epoch": 0.0340921588034752, "grad_norm": 0.8196714978615802, "learning_rate": 9.974250511879031e-07, "loss": 1.8423, "step": 930 }, { "epoch": 0.03445874115620074, "grad_norm": 0.9288752746341313, "learning_rate": 9.97366248838006e-07, "loss": 1.8993, "step": 940 }, { "epoch": 0.03482532350892628, "grad_norm": 0.7950657259868453, "learning_rate": 9.973067844048235e-07, "loss": 1.8741, "step": 950 }, { "epoch": 0.03519190586165182, "grad_norm": 0.796086365915343, "learning_rate": 9.972466579675131e-07, "loss": 1.7832, "step": 960 }, { "epoch": 0.03555848821437736, "grad_norm": 0.9066172708399791, "learning_rate": 9.97185869606114e-07, "loss": 1.8462, "step": 970 }, { "epoch": 0.0359250705671029, "grad_norm": 1.038083569499433, "learning_rate": 9.971244194015463e-07, "loss": 1.858, "step": 980 }, { "epoch": 0.036291652919828436, "grad_norm": 0.9051533251684815, "learning_rate": 9.97062307435611e-07, "loss": 1.8387, "step": 990 }, { "epoch": 0.03665823527255398, "grad_norm": 0.8381523935993735, "learning_rate": 9.969995337909908e-07, "loss": 1.8361, "step": 1000 }, { "epoch": 0.03665823527255398, "eval_accuracy": 0.5988169778677517, "eval_loss": 1.8318405151367188, "eval_runtime": 308.5555, "eval_samples_per_second": 10.718, "eval_steps_per_second": 0.894, "step": 1000 }, { "epoch": 0.03702481762527952, "grad_norm": 0.8427628207388767, "learning_rate": 9.969360985512478e-07, "loss": 1.8265, "step": 1010 }, { "epoch": 0.03739139997800506, "grad_norm": 0.8552215254960128, "learning_rate": 9.968720018008264e-07, "loss": 1.858, "step": 1020 }, { "epoch": 0.0377579823307306, "grad_norm": 0.9770990446912831, "learning_rate": 9.968072436250502e-07, "loss": 1.8336, "step": 1030 }, { "epoch": 0.03812456468345614, "grad_norm": 0.8749109462328284, "learning_rate": 9.967418241101245e-07, "loss": 1.8659, "step": 1040 }, { "epoch": 0.03849114703618168, "grad_norm": 1.0370092544039358, "learning_rate": 9.966757433431338e-07, "loss": 1.7817, "step": 1050 }, { "epoch": 0.03885772938890722, "grad_norm": 0.9115228378829131, "learning_rate": 9.966090014120439e-07, "loss": 1.8024, "step": 1060 }, { "epoch": 0.03922431174163276, "grad_norm": 0.8868427346212977, "learning_rate": 9.965415984056998e-07, "loss": 1.8437, "step": 1070 }, { "epoch": 0.0395908940943583, "grad_norm": 0.9053364161480404, "learning_rate": 9.96473534413827e-07, "loss": 1.817, "step": 1080 }, { "epoch": 0.039957476447083834, "grad_norm": 0.9133195528454671, "learning_rate": 9.964048095270312e-07, "loss": 1.7877, "step": 1090 }, { "epoch": 0.040324058799809374, "grad_norm": 1.0646101033232054, "learning_rate": 9.963354238367971e-07, "loss": 1.784, "step": 1100 }, { "epoch": 0.040690641152534915, "grad_norm": 0.7708104862115812, "learning_rate": 9.962653774354897e-07, "loss": 1.8534, "step": 1110 }, { "epoch": 0.041057223505260455, "grad_norm": 0.8675790148592712, "learning_rate": 9.96194670416353e-07, "loss": 1.8549, "step": 1120 }, { "epoch": 0.041423805857985996, "grad_norm": 0.8417668918121122, "learning_rate": 9.961233028735107e-07, "loss": 1.816, "step": 1130 }, { "epoch": 0.041790388210711536, "grad_norm": 0.8168288703880237, "learning_rate": 9.960512749019661e-07, "loss": 1.8512, "step": 1140 }, { "epoch": 0.04215697056343708, "grad_norm": 0.8018545416660454, "learning_rate": 9.95978586597601e-07, "loss": 1.832, "step": 1150 }, { "epoch": 0.04252355291616262, "grad_norm": 0.9865966895727584, "learning_rate": 9.959052380571764e-07, "loss": 1.853, "step": 1160 }, { "epoch": 0.04289013526888816, "grad_norm": 0.8107907928839149, "learning_rate": 9.958312293783327e-07, "loss": 1.85, "step": 1170 }, { "epoch": 0.0432567176216137, "grad_norm": 0.9230676080344427, "learning_rate": 9.957565606595882e-07, "loss": 1.7839, "step": 1180 }, { "epoch": 0.04362329997433924, "grad_norm": 0.9011134249108275, "learning_rate": 9.956812320003407e-07, "loss": 1.7649, "step": 1190 }, { "epoch": 0.04398988232706477, "grad_norm": 0.8877055310067349, "learning_rate": 9.956052435008657e-07, "loss": 1.8358, "step": 1200 }, { "epoch": 0.04435646467979031, "grad_norm": 0.9441745533847735, "learning_rate": 9.955285952623177e-07, "loss": 1.8217, "step": 1210 }, { "epoch": 0.04472304703251585, "grad_norm": 0.9280531244485228, "learning_rate": 9.954512873867292e-07, "loss": 1.8273, "step": 1220 }, { "epoch": 0.04508962938524139, "grad_norm": 1.0733510489183336, "learning_rate": 9.95373319977011e-07, "loss": 1.8289, "step": 1230 }, { "epoch": 0.045456211737966934, "grad_norm": 0.9194393203848475, "learning_rate": 9.952946931369512e-07, "loss": 1.8134, "step": 1240 }, { "epoch": 0.045822794090692474, "grad_norm": 0.8924651164337065, "learning_rate": 9.952154069712164e-07, "loss": 1.8233, "step": 1250 }, { "epoch": 0.046189376443418015, "grad_norm": 0.9645620934573451, "learning_rate": 9.951354615853506e-07, "loss": 1.7951, "step": 1260 }, { "epoch": 0.046555958796143555, "grad_norm": 0.9514951845878826, "learning_rate": 9.950548570857755e-07, "loss": 1.8034, "step": 1270 }, { "epoch": 0.046922541148869096, "grad_norm": 1.0861848487934576, "learning_rate": 9.949735935797898e-07, "loss": 1.7845, "step": 1280 }, { "epoch": 0.047289123501594636, "grad_norm": 0.9444165617124335, "learning_rate": 9.948916711755702e-07, "loss": 1.8499, "step": 1290 }, { "epoch": 0.04765570585432017, "grad_norm": 0.9296489213610688, "learning_rate": 9.948090899821695e-07, "loss": 1.8362, "step": 1300 }, { "epoch": 0.04802228820704571, "grad_norm": 0.9031404187157595, "learning_rate": 9.947258501095183e-07, "loss": 1.7987, "step": 1310 }, { "epoch": 0.04838887055977125, "grad_norm": 0.9893576898507132, "learning_rate": 9.946419516684238e-07, "loss": 1.7901, "step": 1320 }, { "epoch": 0.04875545291249679, "grad_norm": 0.8312432281714202, "learning_rate": 9.945573947705696e-07, "loss": 1.7877, "step": 1330 }, { "epoch": 0.04912203526522233, "grad_norm": 0.9503234488792208, "learning_rate": 9.944721795285161e-07, "loss": 1.7814, "step": 1340 }, { "epoch": 0.04948861761794787, "grad_norm": 0.8138144516056374, "learning_rate": 9.943863060557e-07, "loss": 1.7973, "step": 1350 }, { "epoch": 0.04985519997067341, "grad_norm": 1.0236050868655204, "learning_rate": 9.942997744664346e-07, "loss": 1.766, "step": 1360 }, { "epoch": 0.05022178232339895, "grad_norm": 0.8876253030811799, "learning_rate": 9.942125848759084e-07, "loss": 1.8025, "step": 1370 }, { "epoch": 0.05058836467612449, "grad_norm": 0.9143837255426513, "learning_rate": 9.941247374001864e-07, "loss": 1.8256, "step": 1380 }, { "epoch": 0.050954947028850034, "grad_norm": 0.7919956208916636, "learning_rate": 9.940362321562095e-07, "loss": 1.7966, "step": 1390 }, { "epoch": 0.051321529381575574, "grad_norm": 0.9593927463945575, "learning_rate": 9.939470692617936e-07, "loss": 1.756, "step": 1400 }, { "epoch": 0.05168811173430111, "grad_norm": 1.0264148022637987, "learning_rate": 9.938572488356309e-07, "loss": 1.7938, "step": 1410 }, { "epoch": 0.05205469408702665, "grad_norm": 1.0694910008156386, "learning_rate": 9.937667709972882e-07, "loss": 1.7151, "step": 1420 }, { "epoch": 0.05242127643975219, "grad_norm": 1.106949179035861, "learning_rate": 9.936756358672075e-07, "loss": 1.7566, "step": 1430 }, { "epoch": 0.05278785879247773, "grad_norm": 0.8484995009187619, "learning_rate": 9.935838435667062e-07, "loss": 1.8061, "step": 1440 }, { "epoch": 0.05315444114520327, "grad_norm": 0.9442924790988804, "learning_rate": 9.93491394217976e-07, "loss": 1.7938, "step": 1450 }, { "epoch": 0.05352102349792881, "grad_norm": 0.8835040984395444, "learning_rate": 9.933982879440838e-07, "loss": 1.7801, "step": 1460 }, { "epoch": 0.05388760585065435, "grad_norm": 0.951681021528121, "learning_rate": 9.933045248689704e-07, "loss": 1.7839, "step": 1470 }, { "epoch": 0.05425418820337989, "grad_norm": 0.8986214443009446, "learning_rate": 9.932101051174513e-07, "loss": 1.8251, "step": 1480 }, { "epoch": 0.05462077055610543, "grad_norm": 0.8136477078651573, "learning_rate": 9.93115028815216e-07, "loss": 1.8429, "step": 1490 }, { "epoch": 0.05498735290883097, "grad_norm": 1.0031260237221131, "learning_rate": 9.93019296088828e-07, "loss": 1.7663, "step": 1500 }, { "epoch": 0.055353935261556506, "grad_norm": 0.9959012828848206, "learning_rate": 9.92922907065725e-07, "loss": 1.8269, "step": 1510 }, { "epoch": 0.055720517614282046, "grad_norm": 0.8915575658825868, "learning_rate": 9.928258618742176e-07, "loss": 1.7696, "step": 1520 }, { "epoch": 0.056087099967007586, "grad_norm": 0.9963782636445598, "learning_rate": 9.927281606434902e-07, "loss": 1.7738, "step": 1530 }, { "epoch": 0.05645368231973313, "grad_norm": 0.9381564546633785, "learning_rate": 9.92629803503601e-07, "loss": 1.7333, "step": 1540 }, { "epoch": 0.05682026467245867, "grad_norm": 1.0017202007335113, "learning_rate": 9.925307905854807e-07, "loss": 1.8095, "step": 1550 }, { "epoch": 0.05718684702518421, "grad_norm": 1.0543725728983615, "learning_rate": 9.924311220209332e-07, "loss": 1.7571, "step": 1560 }, { "epoch": 0.05755342937790975, "grad_norm": 1.0455383232236297, "learning_rate": 9.92330797942635e-07, "loss": 1.7605, "step": 1570 }, { "epoch": 0.05792001173063529, "grad_norm": 0.8416991518569622, "learning_rate": 9.922298184841356e-07, "loss": 1.7703, "step": 1580 }, { "epoch": 0.05828659408336083, "grad_norm": 0.92044213042727, "learning_rate": 9.921281837798565e-07, "loss": 1.7051, "step": 1590 }, { "epoch": 0.05865317643608637, "grad_norm": 0.9422384532621354, "learning_rate": 9.920258939650918e-07, "loss": 1.7882, "step": 1600 }, { "epoch": 0.0590197587888119, "grad_norm": 1.1464397608985724, "learning_rate": 9.919229491760074e-07, "loss": 1.7504, "step": 1610 }, { "epoch": 0.059386341141537444, "grad_norm": 1.1503410560007548, "learning_rate": 9.918193495496411e-07, "loss": 1.7755, "step": 1620 }, { "epoch": 0.059752923494262984, "grad_norm": 1.034854775422536, "learning_rate": 9.917150952239028e-07, "loss": 1.8109, "step": 1630 }, { "epoch": 0.060119505846988525, "grad_norm": 0.9357240877838402, "learning_rate": 9.916101863375734e-07, "loss": 1.812, "step": 1640 }, { "epoch": 0.060486088199714065, "grad_norm": 1.2613406348730127, "learning_rate": 9.915046230303055e-07, "loss": 1.7299, "step": 1650 }, { "epoch": 0.060852670552439606, "grad_norm": 0.991269818479319, "learning_rate": 9.913984054426226e-07, "loss": 1.6839, "step": 1660 }, { "epoch": 0.061219252905165146, "grad_norm": 1.0426302229265827, "learning_rate": 9.91291533715919e-07, "loss": 1.6983, "step": 1670 }, { "epoch": 0.061585835257890686, "grad_norm": 1.0623577818006307, "learning_rate": 9.911840079924607e-07, "loss": 1.7586, "step": 1680 }, { "epoch": 0.06195241761061623, "grad_norm": 0.9792793493189645, "learning_rate": 9.910758284153834e-07, "loss": 1.7863, "step": 1690 }, { "epoch": 0.06231899996334177, "grad_norm": 1.1013133546227525, "learning_rate": 9.90966995128693e-07, "loss": 1.7586, "step": 1700 }, { "epoch": 0.0626855823160673, "grad_norm": 1.2653001609685381, "learning_rate": 9.908575082772664e-07, "loss": 1.7087, "step": 1710 }, { "epoch": 0.06305216466879285, "grad_norm": 1.2600949114865185, "learning_rate": 9.907473680068501e-07, "loss": 1.6974, "step": 1720 }, { "epoch": 0.06341874702151838, "grad_norm": 1.0352843166386823, "learning_rate": 9.906365744640605e-07, "loss": 1.7247, "step": 1730 }, { "epoch": 0.06378532937424393, "grad_norm": 1.0534586823177523, "learning_rate": 9.905251277963838e-07, "loss": 1.7989, "step": 1740 }, { "epoch": 0.06415191172696946, "grad_norm": 1.0901888662447625, "learning_rate": 9.904130281521749e-07, "loss": 1.7495, "step": 1750 }, { "epoch": 0.06451849407969501, "grad_norm": 1.0657237836075932, "learning_rate": 9.903002756806589e-07, "loss": 1.7393, "step": 1760 }, { "epoch": 0.06488507643242054, "grad_norm": 1.0695629454280169, "learning_rate": 9.901868705319291e-07, "loss": 1.784, "step": 1770 }, { "epoch": 0.06525165878514608, "grad_norm": 0.9206279700392275, "learning_rate": 9.900728128569482e-07, "loss": 1.758, "step": 1780 }, { "epoch": 0.06561824113787162, "grad_norm": 1.0410164391482535, "learning_rate": 9.899581028075473e-07, "loss": 1.7252, "step": 1790 }, { "epoch": 0.06598482349059716, "grad_norm": 0.9377493357256449, "learning_rate": 9.898427405364262e-07, "loss": 1.74, "step": 1800 }, { "epoch": 0.0663514058433227, "grad_norm": 1.1272971880737597, "learning_rate": 9.897267261971524e-07, "loss": 1.7524, "step": 1810 }, { "epoch": 0.06671798819604824, "grad_norm": 1.0979559562270786, "learning_rate": 9.896100599441618e-07, "loss": 1.6988, "step": 1820 }, { "epoch": 0.06708457054877379, "grad_norm": 0.961855276743755, "learning_rate": 9.894927419327576e-07, "loss": 1.7327, "step": 1830 }, { "epoch": 0.06745115290149932, "grad_norm": 0.97235897562474, "learning_rate": 9.893747723191118e-07, "loss": 1.7544, "step": 1840 }, { "epoch": 0.06781773525422487, "grad_norm": 1.1764451813427488, "learning_rate": 9.892561512602626e-07, "loss": 1.7616, "step": 1850 }, { "epoch": 0.0681843176069504, "grad_norm": 0.9690232157285822, "learning_rate": 9.891368789141158e-07, "loss": 1.7386, "step": 1860 }, { "epoch": 0.06855089995967593, "grad_norm": 1.131145797735988, "learning_rate": 9.89016955439444e-07, "loss": 1.7473, "step": 1870 }, { "epoch": 0.06891748231240148, "grad_norm": 1.1996910697441496, "learning_rate": 9.88896380995887e-07, "loss": 1.7502, "step": 1880 }, { "epoch": 0.06928406466512702, "grad_norm": 1.2280647210603344, "learning_rate": 9.887751557439513e-07, "loss": 1.7547, "step": 1890 }, { "epoch": 0.06965064701785256, "grad_norm": 1.0705375351848956, "learning_rate": 9.886532798450085e-07, "loss": 1.7577, "step": 1900 }, { "epoch": 0.0700172293705781, "grad_norm": 1.0083918166967278, "learning_rate": 9.88530753461298e-07, "loss": 1.7193, "step": 1910 }, { "epoch": 0.07038381172330364, "grad_norm": 1.0053388433251793, "learning_rate": 9.884075767559236e-07, "loss": 1.7635, "step": 1920 }, { "epoch": 0.07075039407602918, "grad_norm": 1.1405257537860627, "learning_rate": 9.88283749892856e-07, "loss": 1.7859, "step": 1930 }, { "epoch": 0.07111697642875472, "grad_norm": 1.3872222978621402, "learning_rate": 9.881592730369305e-07, "loss": 1.6823, "step": 1940 }, { "epoch": 0.07148355878148026, "grad_norm": 1.0500974949147595, "learning_rate": 9.880341463538483e-07, "loss": 1.7268, "step": 1950 }, { "epoch": 0.0718501411342058, "grad_norm": 1.1146107157958263, "learning_rate": 9.879083700101754e-07, "loss": 1.7324, "step": 1960 }, { "epoch": 0.07221672348693134, "grad_norm": 1.0782444093138666, "learning_rate": 9.877819441733421e-07, "loss": 1.7219, "step": 1970 }, { "epoch": 0.07258330583965687, "grad_norm": 1.1066515564824118, "learning_rate": 9.876548690116443e-07, "loss": 1.6974, "step": 1980 }, { "epoch": 0.07294988819238242, "grad_norm": 1.0551270004207765, "learning_rate": 9.875271446942416e-07, "loss": 1.7086, "step": 1990 }, { "epoch": 0.07331647054510795, "grad_norm": 1.0172022580059552, "learning_rate": 9.873987713911579e-07, "loss": 1.7281, "step": 2000 }, { "epoch": 0.07331647054510795, "eval_accuracy": 0.6153943652920695, "eval_loss": 1.7325148582458496, "eval_runtime": 307.9034, "eval_samples_per_second": 10.74, "eval_steps_per_second": 0.896, "step": 2000 }, { "epoch": 0.0736830528978335, "grad_norm": 1.0319650415221862, "learning_rate": 9.872697492732805e-07, "loss": 1.699, "step": 2010 }, { "epoch": 0.07404963525055903, "grad_norm": 0.9982774529316707, "learning_rate": 9.871400785123615e-07, "loss": 1.7476, "step": 2020 }, { "epoch": 0.07441621760328458, "grad_norm": 1.1272779709424325, "learning_rate": 9.870097592810156e-07, "loss": 1.7911, "step": 2030 }, { "epoch": 0.07478279995601012, "grad_norm": 1.0356947186293473, "learning_rate": 9.86878791752721e-07, "loss": 1.7038, "step": 2040 }, { "epoch": 0.07514938230873566, "grad_norm": 0.9227271241300935, "learning_rate": 9.867471761018187e-07, "loss": 1.789, "step": 2050 }, { "epoch": 0.0755159646614612, "grad_norm": 1.1484518524699514, "learning_rate": 9.86614912503513e-07, "loss": 1.7706, "step": 2060 }, { "epoch": 0.07588254701418674, "grad_norm": 0.8955923870076745, "learning_rate": 9.864820011338698e-07, "loss": 1.7543, "step": 2070 }, { "epoch": 0.07624912936691228, "grad_norm": 1.1335067807492596, "learning_rate": 9.863484421698182e-07, "loss": 1.7155, "step": 2080 }, { "epoch": 0.07661571171963781, "grad_norm": 1.1784649675887455, "learning_rate": 9.86214235789149e-07, "loss": 1.7198, "step": 2090 }, { "epoch": 0.07698229407236336, "grad_norm": 0.9990776315852751, "learning_rate": 9.860793821705153e-07, "loss": 1.7088, "step": 2100 }, { "epoch": 0.07734887642508889, "grad_norm": 1.8933737366748618, "learning_rate": 9.859438814934306e-07, "loss": 1.7815, "step": 2110 }, { "epoch": 0.07771545877781444, "grad_norm": 1.0824373033670114, "learning_rate": 9.858077339382708e-07, "loss": 1.7056, "step": 2120 }, { "epoch": 0.07808204113053997, "grad_norm": 1.0459040499217758, "learning_rate": 9.856709396862727e-07, "loss": 1.7587, "step": 2130 }, { "epoch": 0.07844862348326552, "grad_norm": 1.1273027866420589, "learning_rate": 9.855334989195338e-07, "loss": 1.6718, "step": 2140 }, { "epoch": 0.07881520583599105, "grad_norm": 1.1216307142085522, "learning_rate": 9.853954118210124e-07, "loss": 1.6925, "step": 2150 }, { "epoch": 0.0791817881887166, "grad_norm": 1.2320479842440668, "learning_rate": 9.852566785745269e-07, "loss": 1.7128, "step": 2160 }, { "epoch": 0.07954837054144213, "grad_norm": 1.0679388999130817, "learning_rate": 9.851172993647562e-07, "loss": 1.7063, "step": 2170 }, { "epoch": 0.07991495289416767, "grad_norm": 1.2733808120999472, "learning_rate": 9.849772743772387e-07, "loss": 1.69, "step": 2180 }, { "epoch": 0.08028153524689322, "grad_norm": 1.240045987921097, "learning_rate": 9.848366037983728e-07, "loss": 1.7382, "step": 2190 }, { "epoch": 0.08064811759961875, "grad_norm": 1.0370629833579919, "learning_rate": 9.846952878154162e-07, "loss": 1.7135, "step": 2200 }, { "epoch": 0.0810146999523443, "grad_norm": 1.1809158590474762, "learning_rate": 9.845533266164856e-07, "loss": 1.7197, "step": 2210 }, { "epoch": 0.08138128230506983, "grad_norm": 1.0143562772242192, "learning_rate": 9.844107203905567e-07, "loss": 1.7062, "step": 2220 }, { "epoch": 0.08174786465779538, "grad_norm": 1.1841441026483928, "learning_rate": 9.842674693274639e-07, "loss": 1.6766, "step": 2230 }, { "epoch": 0.08211444701052091, "grad_norm": 1.1281564379658906, "learning_rate": 9.841235736179e-07, "loss": 1.6485, "step": 2240 }, { "epoch": 0.08248102936324646, "grad_norm": 1.2660731034162191, "learning_rate": 9.83979033453416e-07, "loss": 1.7513, "step": 2250 }, { "epoch": 0.08284761171597199, "grad_norm": 1.1670722746985231, "learning_rate": 9.8383384902642e-07, "loss": 1.7282, "step": 2260 }, { "epoch": 0.08321419406869754, "grad_norm": 1.1924698170354644, "learning_rate": 9.836880205301795e-07, "loss": 1.7339, "step": 2270 }, { "epoch": 0.08358077642142307, "grad_norm": 1.0522491790203259, "learning_rate": 9.835415481588173e-07, "loss": 1.6907, "step": 2280 }, { "epoch": 0.0839473587741486, "grad_norm": 1.1650865835189006, "learning_rate": 9.83394432107315e-07, "loss": 1.718, "step": 2290 }, { "epoch": 0.08431394112687415, "grad_norm": 0.9881537861019963, "learning_rate": 9.832466725715097e-07, "loss": 1.7423, "step": 2300 }, { "epoch": 0.08468052347959969, "grad_norm": 1.0843420992658444, "learning_rate": 9.830982697480958e-07, "loss": 1.7112, "step": 2310 }, { "epoch": 0.08504710583232523, "grad_norm": 1.1947303847486304, "learning_rate": 9.829492238346244e-07, "loss": 1.6813, "step": 2320 }, { "epoch": 0.08541368818505077, "grad_norm": 1.04336555772043, "learning_rate": 9.82799535029502e-07, "loss": 1.6871, "step": 2330 }, { "epoch": 0.08578027053777632, "grad_norm": 1.3465243494238373, "learning_rate": 9.826492035319911e-07, "loss": 1.7358, "step": 2340 }, { "epoch": 0.08614685289050185, "grad_norm": 1.1173189734449491, "learning_rate": 9.824982295422097e-07, "loss": 1.7047, "step": 2350 }, { "epoch": 0.0865134352432274, "grad_norm": 1.2520018391632697, "learning_rate": 9.823466132611313e-07, "loss": 1.6984, "step": 2360 }, { "epoch": 0.08688001759595293, "grad_norm": 1.03470369404529, "learning_rate": 9.82194354890584e-07, "loss": 1.7278, "step": 2370 }, { "epoch": 0.08724659994867848, "grad_norm": 1.0164204083388344, "learning_rate": 9.820414546332513e-07, "loss": 1.7458, "step": 2380 }, { "epoch": 0.08761318230140401, "grad_norm": 1.2348821126024987, "learning_rate": 9.818879126926701e-07, "loss": 1.7343, "step": 2390 }, { "epoch": 0.08797976465412954, "grad_norm": 1.0011105767660962, "learning_rate": 9.817337292732328e-07, "loss": 1.7131, "step": 2400 }, { "epoch": 0.08834634700685509, "grad_norm": 1.0710762717577924, "learning_rate": 9.815789045801847e-07, "loss": 1.6617, "step": 2410 }, { "epoch": 0.08871292935958063, "grad_norm": 1.1055970569118785, "learning_rate": 9.814234388196252e-07, "loss": 1.758, "step": 2420 }, { "epoch": 0.08907951171230617, "grad_norm": 1.013594052614807, "learning_rate": 9.81267332198507e-07, "loss": 1.6906, "step": 2430 }, { "epoch": 0.0894460940650317, "grad_norm": 1.0649424099545044, "learning_rate": 9.811105849246359e-07, "loss": 1.6896, "step": 2440 }, { "epoch": 0.08981267641775725, "grad_norm": 1.7084885584877294, "learning_rate": 9.809531972066705e-07, "loss": 1.6614, "step": 2450 }, { "epoch": 0.09017925877048279, "grad_norm": 1.5758236147361129, "learning_rate": 9.807951692541217e-07, "loss": 1.6952, "step": 2460 }, { "epoch": 0.09054584112320833, "grad_norm": 1.3585874981966901, "learning_rate": 9.806365012773532e-07, "loss": 1.7113, "step": 2470 }, { "epoch": 0.09091242347593387, "grad_norm": 1.3061869321513975, "learning_rate": 9.804771934875807e-07, "loss": 1.6796, "step": 2480 }, { "epoch": 0.09127900582865942, "grad_norm": 1.1540286110201206, "learning_rate": 9.803172460968705e-07, "loss": 1.7097, "step": 2490 }, { "epoch": 0.09164558818138495, "grad_norm": 1.2915686809771951, "learning_rate": 9.80156659318142e-07, "loss": 1.7138, "step": 2500 }, { "epoch": 0.09201217053411048, "grad_norm": 1.1468908768097306, "learning_rate": 9.799954333651642e-07, "loss": 1.7038, "step": 2510 }, { "epoch": 0.09237875288683603, "grad_norm": 1.257655656482852, "learning_rate": 9.79833568452558e-07, "loss": 1.677, "step": 2520 }, { "epoch": 0.09274533523956156, "grad_norm": 1.6361492549326027, "learning_rate": 9.796710647957944e-07, "loss": 1.6155, "step": 2530 }, { "epoch": 0.09311191759228711, "grad_norm": 1.1505717408841072, "learning_rate": 9.795079226111949e-07, "loss": 1.6811, "step": 2540 }, { "epoch": 0.09347849994501264, "grad_norm": 1.1983166183129195, "learning_rate": 9.793441421159308e-07, "loss": 1.7203, "step": 2550 }, { "epoch": 0.09384508229773819, "grad_norm": 1.1985818933727272, "learning_rate": 9.79179723528023e-07, "loss": 1.7232, "step": 2560 }, { "epoch": 0.09421166465046373, "grad_norm": 1.0143700528752713, "learning_rate": 9.790146670663422e-07, "loss": 1.6916, "step": 2570 }, { "epoch": 0.09457824700318927, "grad_norm": 1.121117592417204, "learning_rate": 9.788489729506082e-07, "loss": 1.6683, "step": 2580 }, { "epoch": 0.0949448293559148, "grad_norm": 1.339002521581536, "learning_rate": 9.78682641401389e-07, "loss": 1.6622, "step": 2590 }, { "epoch": 0.09531141170864034, "grad_norm": 1.1212646774920143, "learning_rate": 9.785156726401019e-07, "loss": 1.687, "step": 2600 }, { "epoch": 0.09567799406136589, "grad_norm": 1.2061879994547406, "learning_rate": 9.78348066889012e-07, "loss": 1.6652, "step": 2610 }, { "epoch": 0.09604457641409142, "grad_norm": 1.225185884537581, "learning_rate": 9.781798243712326e-07, "loss": 1.6948, "step": 2620 }, { "epoch": 0.09641115876681697, "grad_norm": 1.0146497215382635, "learning_rate": 9.780109453107245e-07, "loss": 1.7009, "step": 2630 }, { "epoch": 0.0967777411195425, "grad_norm": 1.2171300466801498, "learning_rate": 9.77841429932296e-07, "loss": 1.7087, "step": 2640 }, { "epoch": 0.09714432347226805, "grad_norm": 1.0629828650910798, "learning_rate": 9.77671278461602e-07, "loss": 1.7316, "step": 2650 }, { "epoch": 0.09751090582499358, "grad_norm": 1.1754432625786018, "learning_rate": 9.775004911251448e-07, "loss": 1.6953, "step": 2660 }, { "epoch": 0.09787748817771913, "grad_norm": 1.3069724342535498, "learning_rate": 9.773290681502727e-07, "loss": 1.7057, "step": 2670 }, { "epoch": 0.09824407053044466, "grad_norm": 1.3314679455466842, "learning_rate": 9.7715700976518e-07, "loss": 1.6842, "step": 2680 }, { "epoch": 0.09861065288317021, "grad_norm": 1.3928937247531508, "learning_rate": 9.769843161989079e-07, "loss": 1.7052, "step": 2690 }, { "epoch": 0.09897723523589574, "grad_norm": 1.3389115391442472, "learning_rate": 9.768109876813417e-07, "loss": 1.6905, "step": 2700 }, { "epoch": 0.09934381758862128, "grad_norm": 1.2854315608533564, "learning_rate": 9.76637024443213e-07, "loss": 1.6806, "step": 2710 }, { "epoch": 0.09971039994134683, "grad_norm": 1.24293956575573, "learning_rate": 9.764624267160975e-07, "loss": 1.6922, "step": 2720 }, { "epoch": 0.10007698229407236, "grad_norm": 1.2809307536658918, "learning_rate": 9.762871947324165e-07, "loss": 1.7001, "step": 2730 }, { "epoch": 0.1004435646467979, "grad_norm": 1.1615070632030087, "learning_rate": 9.761113287254345e-07, "loss": 1.6747, "step": 2740 }, { "epoch": 0.10081014699952344, "grad_norm": 1.245140216818738, "learning_rate": 9.75934828929261e-07, "loss": 1.6469, "step": 2750 }, { "epoch": 0.10117672935224899, "grad_norm": 1.152316966014997, "learning_rate": 9.757576955788486e-07, "loss": 1.6773, "step": 2760 }, { "epoch": 0.10154331170497452, "grad_norm": 1.1064605629765938, "learning_rate": 9.755799289099932e-07, "loss": 1.6447, "step": 2770 }, { "epoch": 0.10190989405770007, "grad_norm": 1.1150499110452152, "learning_rate": 9.754015291593343e-07, "loss": 1.7168, "step": 2780 }, { "epoch": 0.1022764764104256, "grad_norm": 1.3016769905995789, "learning_rate": 9.752224965643536e-07, "loss": 1.7209, "step": 2790 }, { "epoch": 0.10264305876315115, "grad_norm": 1.332321427009131, "learning_rate": 9.750428313633757e-07, "loss": 1.6247, "step": 2800 }, { "epoch": 0.10300964111587668, "grad_norm": 1.311092146207188, "learning_rate": 9.748625337955667e-07, "loss": 1.6366, "step": 2810 }, { "epoch": 0.10337622346860222, "grad_norm": 1.1634742047900515, "learning_rate": 9.746816041009351e-07, "loss": 1.7143, "step": 2820 }, { "epoch": 0.10374280582132776, "grad_norm": 1.1916284602740692, "learning_rate": 9.745000425203307e-07, "loss": 1.6568, "step": 2830 }, { "epoch": 0.1041093881740533, "grad_norm": 1.280571751055567, "learning_rate": 9.743178492954442e-07, "loss": 1.6303, "step": 2840 }, { "epoch": 0.10447597052677884, "grad_norm": 1.3621017517970784, "learning_rate": 9.741350246688076e-07, "loss": 1.7569, "step": 2850 }, { "epoch": 0.10484255287950438, "grad_norm": 1.1019913075705825, "learning_rate": 9.739515688837927e-07, "loss": 1.6934, "step": 2860 }, { "epoch": 0.10520913523222993, "grad_norm": 1.3868159647800968, "learning_rate": 9.73767482184612e-07, "loss": 1.6267, "step": 2870 }, { "epoch": 0.10557571758495546, "grad_norm": 1.4881189853618986, "learning_rate": 9.73582764816318e-07, "loss": 1.7354, "step": 2880 }, { "epoch": 0.105942299937681, "grad_norm": 1.5118948532986631, "learning_rate": 9.733974170248025e-07, "loss": 1.6856, "step": 2890 }, { "epoch": 0.10630888229040654, "grad_norm": 1.4796154680218983, "learning_rate": 9.732114390567963e-07, "loss": 1.7045, "step": 2900 }, { "epoch": 0.10667546464313207, "grad_norm": 1.2560441338500297, "learning_rate": 9.730248311598694e-07, "loss": 1.6466, "step": 2910 }, { "epoch": 0.10704204699585762, "grad_norm": 1.1595828068992133, "learning_rate": 9.728375935824301e-07, "loss": 1.6822, "step": 2920 }, { "epoch": 0.10740862934858315, "grad_norm": 1.3126146065763922, "learning_rate": 9.726497265737252e-07, "loss": 1.6723, "step": 2930 }, { "epoch": 0.1077752117013087, "grad_norm": 1.2296488317137073, "learning_rate": 9.724612303838393e-07, "loss": 1.6647, "step": 2940 }, { "epoch": 0.10814179405403423, "grad_norm": 1.170972623285309, "learning_rate": 9.722721052636944e-07, "loss": 1.6955, "step": 2950 }, { "epoch": 0.10850837640675978, "grad_norm": 1.2633141406462256, "learning_rate": 9.720823514650495e-07, "loss": 1.6332, "step": 2960 }, { "epoch": 0.10887495875948532, "grad_norm": 1.2911934178837097, "learning_rate": 9.718919692405014e-07, "loss": 1.7218, "step": 2970 }, { "epoch": 0.10924154111221086, "grad_norm": 1.1657180939495957, "learning_rate": 9.717009588434822e-07, "loss": 1.6067, "step": 2980 }, { "epoch": 0.1096081234649364, "grad_norm": 1.239214562886889, "learning_rate": 9.715093205282615e-07, "loss": 1.7067, "step": 2990 }, { "epoch": 0.10997470581766194, "grad_norm": 1.3619661984646028, "learning_rate": 9.713170545499435e-07, "loss": 1.6978, "step": 3000 }, { "epoch": 0.10997470581766194, "eval_accuracy": 0.6262376782115725, "eval_loss": 1.6762739419937134, "eval_runtime": 309.1255, "eval_samples_per_second": 10.698, "eval_steps_per_second": 0.893, "step": 3000 }, { "epoch": 0.11034128817038748, "grad_norm": 1.2670499181513593, "learning_rate": 9.711241611644688e-07, "loss": 1.677, "step": 3010 }, { "epoch": 0.11070787052311301, "grad_norm": 1.2403940254412753, "learning_rate": 9.709306406286129e-07, "loss": 1.6604, "step": 3020 }, { "epoch": 0.11107445287583856, "grad_norm": 1.3312898520587448, "learning_rate": 9.707364931999864e-07, "loss": 1.6867, "step": 3030 }, { "epoch": 0.11144103522856409, "grad_norm": 1.3495930407749666, "learning_rate": 9.70541719137034e-07, "loss": 1.6617, "step": 3040 }, { "epoch": 0.11180761758128964, "grad_norm": 1.1396532709110236, "learning_rate": 9.703463186990346e-07, "loss": 1.7035, "step": 3050 }, { "epoch": 0.11217419993401517, "grad_norm": 1.2231802562577823, "learning_rate": 9.701502921461013e-07, "loss": 1.6723, "step": 3060 }, { "epoch": 0.11254078228674072, "grad_norm": 1.3403523967021675, "learning_rate": 9.699536397391806e-07, "loss": 1.6698, "step": 3070 }, { "epoch": 0.11290736463946625, "grad_norm": 1.3447918453958256, "learning_rate": 9.697563617400516e-07, "loss": 1.6716, "step": 3080 }, { "epoch": 0.1132739469921918, "grad_norm": 1.2969348535087712, "learning_rate": 9.695584584113267e-07, "loss": 1.6949, "step": 3090 }, { "epoch": 0.11364052934491733, "grad_norm": 1.1643584556065927, "learning_rate": 9.693599300164508e-07, "loss": 1.6713, "step": 3100 }, { "epoch": 0.11400711169764288, "grad_norm": 1.2242377804664155, "learning_rate": 9.691607768197002e-07, "loss": 1.6386, "step": 3110 }, { "epoch": 0.11437369405036842, "grad_norm": 1.319822492671326, "learning_rate": 9.689609990861837e-07, "loss": 1.6816, "step": 3120 }, { "epoch": 0.11474027640309395, "grad_norm": 1.3781452196212938, "learning_rate": 9.687605970818408e-07, "loss": 1.6784, "step": 3130 }, { "epoch": 0.1151068587558195, "grad_norm": 1.2168088100404522, "learning_rate": 9.68559571073443e-07, "loss": 1.6982, "step": 3140 }, { "epoch": 0.11547344110854503, "grad_norm": 1.4540401524570652, "learning_rate": 9.68357921328591e-07, "loss": 1.6718, "step": 3150 }, { "epoch": 0.11584002346127058, "grad_norm": 1.3143498063269197, "learning_rate": 9.681556481157171e-07, "loss": 1.6709, "step": 3160 }, { "epoch": 0.11620660581399611, "grad_norm": 1.1946622719420839, "learning_rate": 9.679527517040831e-07, "loss": 1.6747, "step": 3170 }, { "epoch": 0.11657318816672166, "grad_norm": 1.286257203814063, "learning_rate": 9.6774923236378e-07, "loss": 1.699, "step": 3180 }, { "epoch": 0.11693977051944719, "grad_norm": 1.3969179686751765, "learning_rate": 9.675450903657286e-07, "loss": 1.6228, "step": 3190 }, { "epoch": 0.11730635287217274, "grad_norm": 1.1607892230894732, "learning_rate": 9.673403259816787e-07, "loss": 1.6538, "step": 3200 }, { "epoch": 0.11767293522489827, "grad_norm": 1.4009629932701972, "learning_rate": 9.671349394842075e-07, "loss": 1.6401, "step": 3210 }, { "epoch": 0.1180395175776238, "grad_norm": 1.5024706182569632, "learning_rate": 9.669289311467216e-07, "loss": 1.6508, "step": 3220 }, { "epoch": 0.11840609993034935, "grad_norm": 1.9466998313668968, "learning_rate": 9.66722301243455e-07, "loss": 1.6662, "step": 3230 }, { "epoch": 0.11877268228307489, "grad_norm": 1.6928758946763174, "learning_rate": 9.665150500494686e-07, "loss": 1.681, "step": 3240 }, { "epoch": 0.11913926463580043, "grad_norm": 1.5050927792757436, "learning_rate": 9.66307177840651e-07, "loss": 1.6669, "step": 3250 }, { "epoch": 0.11950584698852597, "grad_norm": 1.179067981511082, "learning_rate": 9.66098684893717e-07, "loss": 1.6503, "step": 3260 }, { "epoch": 0.11987242934125152, "grad_norm": 1.7279906281142485, "learning_rate": 9.658895714862082e-07, "loss": 1.6331, "step": 3270 }, { "epoch": 0.12023901169397705, "grad_norm": 1.1891919657193728, "learning_rate": 9.656798378964918e-07, "loss": 1.6111, "step": 3280 }, { "epoch": 0.1206055940467026, "grad_norm": 1.7749941957068498, "learning_rate": 9.654694844037607e-07, "loss": 1.666, "step": 3290 }, { "epoch": 0.12097217639942813, "grad_norm": 1.5093366351881725, "learning_rate": 9.65258511288033e-07, "loss": 1.6569, "step": 3300 }, { "epoch": 0.12133875875215368, "grad_norm": 1.2872309950824516, "learning_rate": 9.650469188301512e-07, "loss": 1.6697, "step": 3310 }, { "epoch": 0.12170534110487921, "grad_norm": 1.2299002535631731, "learning_rate": 9.648347073117832e-07, "loss": 1.6413, "step": 3320 }, { "epoch": 0.12207192345760474, "grad_norm": 1.407253463937065, "learning_rate": 9.6462187701542e-07, "loss": 1.6757, "step": 3330 }, { "epoch": 0.12243850581033029, "grad_norm": 1.166071729039829, "learning_rate": 9.644084282243768e-07, "loss": 1.6654, "step": 3340 }, { "epoch": 0.12280508816305583, "grad_norm": 1.558952263125209, "learning_rate": 9.641943612227921e-07, "loss": 1.6807, "step": 3350 }, { "epoch": 0.12317167051578137, "grad_norm": 1.3374281457093373, "learning_rate": 9.639796762956276e-07, "loss": 1.6664, "step": 3360 }, { "epoch": 0.1235382528685069, "grad_norm": 1.1902844247942133, "learning_rate": 9.637643737286667e-07, "loss": 1.6914, "step": 3370 }, { "epoch": 0.12390483522123245, "grad_norm": 1.2998133772041194, "learning_rate": 9.63548453808516e-07, "loss": 1.7112, "step": 3380 }, { "epoch": 0.12427141757395799, "grad_norm": 1.3162405748836254, "learning_rate": 9.633319168226036e-07, "loss": 1.6936, "step": 3390 }, { "epoch": 0.12463799992668353, "grad_norm": 1.3677758198871173, "learning_rate": 9.631147630591782e-07, "loss": 1.6883, "step": 3400 }, { "epoch": 0.12500458227940908, "grad_norm": 1.2054292111865461, "learning_rate": 9.62896992807311e-07, "loss": 1.6576, "step": 3410 }, { "epoch": 0.1253711646321346, "grad_norm": 1.156101638091166, "learning_rate": 9.626786063568925e-07, "loss": 1.6667, "step": 3420 }, { "epoch": 0.12573774698486015, "grad_norm": 1.3745543808654352, "learning_rate": 9.624596039986343e-07, "loss": 1.6712, "step": 3430 }, { "epoch": 0.1261043293375857, "grad_norm": 1.178401890967186, "learning_rate": 9.622399860240679e-07, "loss": 1.6474, "step": 3440 }, { "epoch": 0.12647091169031122, "grad_norm": 1.4332376083467566, "learning_rate": 9.620197527255436e-07, "loss": 1.6655, "step": 3450 }, { "epoch": 0.12683749404303676, "grad_norm": 1.2402171846377348, "learning_rate": 9.617989043962315e-07, "loss": 1.6349, "step": 3460 }, { "epoch": 0.1272040763957623, "grad_norm": 1.1586534075249035, "learning_rate": 9.615774413301201e-07, "loss": 1.6514, "step": 3470 }, { "epoch": 0.12757065874848786, "grad_norm": 1.3594354851138566, "learning_rate": 9.613553638220162e-07, "loss": 1.6516, "step": 3480 }, { "epoch": 0.12793724110121338, "grad_norm": 1.6613648157437189, "learning_rate": 9.611326721675447e-07, "loss": 1.6111, "step": 3490 }, { "epoch": 0.12830382345393893, "grad_norm": 1.1659314128590663, "learning_rate": 9.60909366663148e-07, "loss": 1.6144, "step": 3500 }, { "epoch": 0.12867040580666447, "grad_norm": 1.3825427999836462, "learning_rate": 9.606854476060858e-07, "loss": 1.6355, "step": 3510 }, { "epoch": 0.12903698815939002, "grad_norm": 1.3221664320987678, "learning_rate": 9.604609152944339e-07, "loss": 1.6582, "step": 3520 }, { "epoch": 0.12940357051211554, "grad_norm": 1.223865417664176, "learning_rate": 9.602357700270848e-07, "loss": 1.6629, "step": 3530 }, { "epoch": 0.1297701528648411, "grad_norm": 1.2654800350319806, "learning_rate": 9.600100121037478e-07, "loss": 1.6746, "step": 3540 }, { "epoch": 0.13013673521756663, "grad_norm": 1.5629673478694224, "learning_rate": 9.597836418249463e-07, "loss": 1.598, "step": 3550 }, { "epoch": 0.13050331757029215, "grad_norm": 1.434783120339992, "learning_rate": 9.5955665949202e-07, "loss": 1.6667, "step": 3560 }, { "epoch": 0.1308698999230177, "grad_norm": 1.391092196783546, "learning_rate": 9.593290654071227e-07, "loss": 1.6533, "step": 3570 }, { "epoch": 0.13123648227574325, "grad_norm": 1.4923072292703214, "learning_rate": 9.591008598732227e-07, "loss": 1.6742, "step": 3580 }, { "epoch": 0.1316030646284688, "grad_norm": 1.313620532521857, "learning_rate": 9.588720431941024e-07, "loss": 1.643, "step": 3590 }, { "epoch": 0.13196964698119432, "grad_norm": 1.527900388849829, "learning_rate": 9.586426156743576e-07, "loss": 1.6466, "step": 3600 }, { "epoch": 0.13233622933391986, "grad_norm": 1.3345529937125478, "learning_rate": 9.584125776193977e-07, "loss": 1.6242, "step": 3610 }, { "epoch": 0.1327028116866454, "grad_norm": 1.1722053149478573, "learning_rate": 9.581819293354437e-07, "loss": 1.6361, "step": 3620 }, { "epoch": 0.13306939403937096, "grad_norm": 1.448965551365503, "learning_rate": 9.579506711295303e-07, "loss": 1.6766, "step": 3630 }, { "epoch": 0.13343597639209648, "grad_norm": 1.435539195626326, "learning_rate": 9.57718803309503e-07, "loss": 1.6639, "step": 3640 }, { "epoch": 0.13380255874482203, "grad_norm": 1.5710598550118229, "learning_rate": 9.574863261840195e-07, "loss": 1.6821, "step": 3650 }, { "epoch": 0.13416914109754757, "grad_norm": 1.3432388820323078, "learning_rate": 9.572532400625486e-07, "loss": 1.6578, "step": 3660 }, { "epoch": 0.1345357234502731, "grad_norm": 1.4304292951831412, "learning_rate": 9.570195452553692e-07, "loss": 1.6683, "step": 3670 }, { "epoch": 0.13490230580299864, "grad_norm": 1.293030659950829, "learning_rate": 9.567852420735707e-07, "loss": 1.6712, "step": 3680 }, { "epoch": 0.1352688881557242, "grad_norm": 1.5727628914988818, "learning_rate": 9.565503308290529e-07, "loss": 1.6362, "step": 3690 }, { "epoch": 0.13563547050844973, "grad_norm": 1.6929875598843593, "learning_rate": 9.56314811834524e-07, "loss": 1.6734, "step": 3700 }, { "epoch": 0.13600205286117525, "grad_norm": 1.5989548687758315, "learning_rate": 9.560786854035027e-07, "loss": 1.6449, "step": 3710 }, { "epoch": 0.1363686352139008, "grad_norm": 1.5032676879166582, "learning_rate": 9.558419518503146e-07, "loss": 1.6572, "step": 3720 }, { "epoch": 0.13673521756662635, "grad_norm": 1.4171570128132858, "learning_rate": 9.55604611490095e-07, "loss": 1.6084, "step": 3730 }, { "epoch": 0.13710179991935187, "grad_norm": 1.445587424899926, "learning_rate": 9.553666646387859e-07, "loss": 1.6226, "step": 3740 }, { "epoch": 0.13746838227207742, "grad_norm": 1.3746442868420083, "learning_rate": 9.55128111613137e-07, "loss": 1.6244, "step": 3750 }, { "epoch": 0.13783496462480296, "grad_norm": 1.379515983296158, "learning_rate": 9.548889527307052e-07, "loss": 1.6178, "step": 3760 }, { "epoch": 0.1382015469775285, "grad_norm": 1.3571114141269711, "learning_rate": 9.546491883098536e-07, "loss": 1.6295, "step": 3770 }, { "epoch": 0.13856812933025403, "grad_norm": 1.463273179907825, "learning_rate": 9.544088186697514e-07, "loss": 1.6252, "step": 3780 }, { "epoch": 0.13893471168297958, "grad_norm": 1.409249057690562, "learning_rate": 9.541678441303736e-07, "loss": 1.6226, "step": 3790 }, { "epoch": 0.13930129403570513, "grad_norm": 1.2549772425250405, "learning_rate": 9.539262650125003e-07, "loss": 1.6904, "step": 3800 }, { "epoch": 0.13966787638843067, "grad_norm": 1.398529314496367, "learning_rate": 9.536840816377163e-07, "loss": 1.641, "step": 3810 }, { "epoch": 0.1400344587411562, "grad_norm": 1.4089240361542354, "learning_rate": 9.534412943284111e-07, "loss": 1.6749, "step": 3820 }, { "epoch": 0.14040104109388174, "grad_norm": 1.2690921990550241, "learning_rate": 9.53197903407778e-07, "loss": 1.6483, "step": 3830 }, { "epoch": 0.1407676234466073, "grad_norm": 1.443019453596183, "learning_rate": 9.529539091998138e-07, "loss": 1.5942, "step": 3840 }, { "epoch": 0.1411342057993328, "grad_norm": 1.3973353826502415, "learning_rate": 9.527093120293179e-07, "loss": 1.6637, "step": 3850 }, { "epoch": 0.14150078815205835, "grad_norm": 1.612241752672322, "learning_rate": 9.524641122218934e-07, "loss": 1.6144, "step": 3860 }, { "epoch": 0.1418673705047839, "grad_norm": 1.6392078912198202, "learning_rate": 9.522183101039447e-07, "loss": 1.599, "step": 3870 }, { "epoch": 0.14223395285750945, "grad_norm": 1.3307238721886945, "learning_rate": 9.519719060026784e-07, "loss": 1.6692, "step": 3880 }, { "epoch": 0.14260053521023497, "grad_norm": 1.3570795255125636, "learning_rate": 9.517249002461023e-07, "loss": 1.6871, "step": 3890 }, { "epoch": 0.14296711756296052, "grad_norm": 1.4037736413570712, "learning_rate": 9.514772931630253e-07, "loss": 1.5922, "step": 3900 }, { "epoch": 0.14333369991568606, "grad_norm": 1.6691508908927133, "learning_rate": 9.512290850830564e-07, "loss": 1.5939, "step": 3910 }, { "epoch": 0.1437002822684116, "grad_norm": 1.2746936442730004, "learning_rate": 9.509802763366052e-07, "loss": 1.6376, "step": 3920 }, { "epoch": 0.14406686462113713, "grad_norm": 1.7263750991736497, "learning_rate": 9.507308672548803e-07, "loss": 1.6251, "step": 3930 }, { "epoch": 0.14443344697386268, "grad_norm": 1.6162337099963227, "learning_rate": 9.504808581698898e-07, "loss": 1.6855, "step": 3940 }, { "epoch": 0.14480002932658823, "grad_norm": 1.4400774058967862, "learning_rate": 9.502302494144405e-07, "loss": 1.6688, "step": 3950 }, { "epoch": 0.14516661167931375, "grad_norm": 1.4106971014212684, "learning_rate": 9.499790413221372e-07, "loss": 1.6212, "step": 3960 }, { "epoch": 0.1455331940320393, "grad_norm": 1.549216443416639, "learning_rate": 9.49727234227383e-07, "loss": 1.6316, "step": 3970 }, { "epoch": 0.14589977638476484, "grad_norm": 1.2499725096259189, "learning_rate": 9.494748284653779e-07, "loss": 1.6113, "step": 3980 }, { "epoch": 0.1462663587374904, "grad_norm": 1.8429540203762498, "learning_rate": 9.492218243721192e-07, "loss": 1.6424, "step": 3990 }, { "epoch": 0.1466329410902159, "grad_norm": 1.4097823826329705, "learning_rate": 9.489682222844004e-07, "loss": 1.5986, "step": 4000 }, { "epoch": 0.1466329410902159, "eval_accuracy": 0.634133690356089, "eval_loss": 1.6327084302902222, "eval_runtime": 310.7367, "eval_samples_per_second": 10.642, "eval_steps_per_second": 0.888, "step": 4000 }, { "epoch": 0.14699952344294145, "grad_norm": 1.4923503061339742, "learning_rate": 9.487140225398112e-07, "loss": 1.6354, "step": 4010 }, { "epoch": 0.147366105795667, "grad_norm": 1.4794551483340477, "learning_rate": 9.484592254767368e-07, "loss": 1.6337, "step": 4020 }, { "epoch": 0.14773268814839255, "grad_norm": 1.5712257291796352, "learning_rate": 9.482038314343577e-07, "loss": 1.6569, "step": 4030 }, { "epoch": 0.14809927050111807, "grad_norm": 1.7977345143090582, "learning_rate": 9.479478407526489e-07, "loss": 1.6489, "step": 4040 }, { "epoch": 0.14846585285384362, "grad_norm": 1.3741458319499518, "learning_rate": 9.476912537723797e-07, "loss": 1.6133, "step": 4050 }, { "epoch": 0.14883243520656916, "grad_norm": 1.4690331639136838, "learning_rate": 9.474340708351131e-07, "loss": 1.6232, "step": 4060 }, { "epoch": 0.14919901755929468, "grad_norm": 1.2959341038239927, "learning_rate": 9.471762922832059e-07, "loss": 1.6136, "step": 4070 }, { "epoch": 0.14956559991202023, "grad_norm": 1.3662274482371721, "learning_rate": 9.469179184598068e-07, "loss": 1.6568, "step": 4080 }, { "epoch": 0.14993218226474578, "grad_norm": 1.6303487241504246, "learning_rate": 9.46658949708858e-07, "loss": 1.5929, "step": 4090 }, { "epoch": 0.15029876461747133, "grad_norm": 1.5690296034603222, "learning_rate": 9.463993863750927e-07, "loss": 1.6273, "step": 4100 }, { "epoch": 0.15066534697019685, "grad_norm": 1.4565888691647535, "learning_rate": 9.461392288040364e-07, "loss": 1.6111, "step": 4110 }, { "epoch": 0.1510319293229224, "grad_norm": 1.3399651168141258, "learning_rate": 9.458784773420052e-07, "loss": 1.6317, "step": 4120 }, { "epoch": 0.15139851167564794, "grad_norm": 1.4314663401678571, "learning_rate": 9.456171323361057e-07, "loss": 1.6149, "step": 4130 }, { "epoch": 0.1517650940283735, "grad_norm": 1.8610614612324794, "learning_rate": 9.45355194134235e-07, "loss": 1.6129, "step": 4140 }, { "epoch": 0.152131676381099, "grad_norm": 1.4894532553388709, "learning_rate": 9.450926630850795e-07, "loss": 1.609, "step": 4150 }, { "epoch": 0.15249825873382455, "grad_norm": 1.4046406522547454, "learning_rate": 9.44829539538115e-07, "loss": 1.5696, "step": 4160 }, { "epoch": 0.1528648410865501, "grad_norm": 1.507747542986857, "learning_rate": 9.445658238436056e-07, "loss": 1.6105, "step": 4170 }, { "epoch": 0.15323142343927562, "grad_norm": 1.5105255618831799, "learning_rate": 9.443015163526043e-07, "loss": 1.6656, "step": 4180 }, { "epoch": 0.15359800579200117, "grad_norm": 1.409667843388443, "learning_rate": 9.440366174169514e-07, "loss": 1.6143, "step": 4190 }, { "epoch": 0.15396458814472672, "grad_norm": 1.4899089219548238, "learning_rate": 9.437711273892748e-07, "loss": 1.6434, "step": 4200 }, { "epoch": 0.15433117049745226, "grad_norm": 1.3835730704800184, "learning_rate": 9.435050466229892e-07, "loss": 1.5896, "step": 4210 }, { "epoch": 0.15469775285017778, "grad_norm": 1.5192649294767298, "learning_rate": 9.432383754722953e-07, "loss": 1.5982, "step": 4220 }, { "epoch": 0.15506433520290333, "grad_norm": 1.414847151501446, "learning_rate": 9.429711142921804e-07, "loss": 1.6195, "step": 4230 }, { "epoch": 0.15543091755562888, "grad_norm": 1.6343731391974052, "learning_rate": 9.427032634384166e-07, "loss": 1.6571, "step": 4240 }, { "epoch": 0.15579749990835443, "grad_norm": 1.3341873108704791, "learning_rate": 9.424348232675612e-07, "loss": 1.6592, "step": 4250 }, { "epoch": 0.15616408226107995, "grad_norm": 1.6008064117545706, "learning_rate": 9.421657941369561e-07, "loss": 1.5976, "step": 4260 }, { "epoch": 0.1565306646138055, "grad_norm": 1.5239464972441716, "learning_rate": 9.418961764047271e-07, "loss": 1.6696, "step": 4270 }, { "epoch": 0.15689724696653104, "grad_norm": 1.4769248460119957, "learning_rate": 9.416259704297836e-07, "loss": 1.5887, "step": 4280 }, { "epoch": 0.15726382931925656, "grad_norm": 1.5681596592695635, "learning_rate": 9.413551765718178e-07, "loss": 1.6013, "step": 4290 }, { "epoch": 0.1576304116719821, "grad_norm": 1.631287334977878, "learning_rate": 9.410837951913049e-07, "loss": 1.5945, "step": 4300 }, { "epoch": 0.15799699402470765, "grad_norm": 1.4050312863210865, "learning_rate": 9.408118266495019e-07, "loss": 1.6402, "step": 4310 }, { "epoch": 0.1583635763774332, "grad_norm": 1.5578526902775003, "learning_rate": 9.405392713084475e-07, "loss": 1.5887, "step": 4320 }, { "epoch": 0.15873015873015872, "grad_norm": 1.838536265304532, "learning_rate": 9.402661295309613e-07, "loss": 1.6579, "step": 4330 }, { "epoch": 0.15909674108288427, "grad_norm": 1.399860997384879, "learning_rate": 9.399924016806442e-07, "loss": 1.6393, "step": 4340 }, { "epoch": 0.15946332343560982, "grad_norm": 1.5068872354692342, "learning_rate": 9.397180881218764e-07, "loss": 1.615, "step": 4350 }, { "epoch": 0.15982990578833534, "grad_norm": 1.3780932641355175, "learning_rate": 9.394431892198187e-07, "loss": 1.5897, "step": 4360 }, { "epoch": 0.16019648814106088, "grad_norm": 1.3266983904985465, "learning_rate": 9.391677053404102e-07, "loss": 1.622, "step": 4370 }, { "epoch": 0.16056307049378643, "grad_norm": 1.620877234564149, "learning_rate": 9.388916368503695e-07, "loss": 1.5967, "step": 4380 }, { "epoch": 0.16092965284651198, "grad_norm": 1.4779982203811086, "learning_rate": 9.386149841171927e-07, "loss": 1.6698, "step": 4390 }, { "epoch": 0.1612962351992375, "grad_norm": 1.8674907963100393, "learning_rate": 9.38337747509154e-07, "loss": 1.587, "step": 4400 }, { "epoch": 0.16166281755196305, "grad_norm": 1.253158061665667, "learning_rate": 9.380599273953052e-07, "loss": 1.5428, "step": 4410 }, { "epoch": 0.1620293999046886, "grad_norm": 1.3525050799204679, "learning_rate": 9.37781524145474e-07, "loss": 1.6247, "step": 4420 }, { "epoch": 0.16239598225741414, "grad_norm": 1.4613300416955568, "learning_rate": 9.375025381302654e-07, "loss": 1.6224, "step": 4430 }, { "epoch": 0.16276256461013966, "grad_norm": 1.2944336505844816, "learning_rate": 9.372229697210592e-07, "loss": 1.6073, "step": 4440 }, { "epoch": 0.1631291469628652, "grad_norm": 1.5174622698952627, "learning_rate": 9.369428192900108e-07, "loss": 1.6071, "step": 4450 }, { "epoch": 0.16349572931559075, "grad_norm": 1.338534858401422, "learning_rate": 9.366620872100508e-07, "loss": 1.6601, "step": 4460 }, { "epoch": 0.16386231166831627, "grad_norm": 1.6728271928417346, "learning_rate": 9.363807738548834e-07, "loss": 1.551, "step": 4470 }, { "epoch": 0.16422889402104182, "grad_norm": 1.302057455107361, "learning_rate": 9.360988795989873e-07, "loss": 1.6131, "step": 4480 }, { "epoch": 0.16459547637376737, "grad_norm": 1.3688499844245678, "learning_rate": 9.358164048176136e-07, "loss": 1.6117, "step": 4490 }, { "epoch": 0.16496205872649292, "grad_norm": 1.8246828901080199, "learning_rate": 9.355333498867869e-07, "loss": 1.5894, "step": 4500 }, { "epoch": 0.16532864107921844, "grad_norm": 1.6028775096282735, "learning_rate": 9.352497151833038e-07, "loss": 1.614, "step": 4510 }, { "epoch": 0.16569522343194398, "grad_norm": 1.4820831927771527, "learning_rate": 9.349655010847329e-07, "loss": 1.6046, "step": 4520 }, { "epoch": 0.16606180578466953, "grad_norm": 1.7672157547664196, "learning_rate": 9.346807079694139e-07, "loss": 1.5998, "step": 4530 }, { "epoch": 0.16642838813739508, "grad_norm": 1.399533793932768, "learning_rate": 9.34395336216457e-07, "loss": 1.6209, "step": 4540 }, { "epoch": 0.1667949704901206, "grad_norm": 1.3639375879771105, "learning_rate": 9.341093862057432e-07, "loss": 1.6321, "step": 4550 }, { "epoch": 0.16716155284284615, "grad_norm": 1.5049904120253712, "learning_rate": 9.338228583179231e-07, "loss": 1.5531, "step": 4560 }, { "epoch": 0.1675281351955717, "grad_norm": 1.2985124195396522, "learning_rate": 9.335357529344162e-07, "loss": 1.5925, "step": 4570 }, { "epoch": 0.1678947175482972, "grad_norm": 1.6446327484619145, "learning_rate": 9.332480704374113e-07, "loss": 1.5926, "step": 4580 }, { "epoch": 0.16826129990102276, "grad_norm": 1.6322229820052805, "learning_rate": 9.329598112098649e-07, "loss": 1.6415, "step": 4590 }, { "epoch": 0.1686278822537483, "grad_norm": 1.4469690988313273, "learning_rate": 9.326709756355018e-07, "loss": 1.5885, "step": 4600 }, { "epoch": 0.16899446460647385, "grad_norm": 2.0102392352379415, "learning_rate": 9.323815640988135e-07, "loss": 1.559, "step": 4610 }, { "epoch": 0.16936104695919937, "grad_norm": 2.121900247865438, "learning_rate": 9.320915769850585e-07, "loss": 1.628, "step": 4620 }, { "epoch": 0.16972762931192492, "grad_norm": 1.6562713457587275, "learning_rate": 9.318010146802615e-07, "loss": 1.6442, "step": 4630 }, { "epoch": 0.17009421166465047, "grad_norm": 1.825933954099794, "learning_rate": 9.315098775712127e-07, "loss": 1.5848, "step": 4640 }, { "epoch": 0.17046079401737602, "grad_norm": 2.2902161148174445, "learning_rate": 9.312181660454677e-07, "loss": 1.5825, "step": 4650 }, { "epoch": 0.17082737637010154, "grad_norm": 1.392734199429953, "learning_rate": 9.309258804913465e-07, "loss": 1.6126, "step": 4660 }, { "epoch": 0.17119395872282708, "grad_norm": 1.565256666892175, "learning_rate": 9.306330212979334e-07, "loss": 1.6022, "step": 4670 }, { "epoch": 0.17156054107555263, "grad_norm": 1.7600380550932417, "learning_rate": 9.303395888550763e-07, "loss": 1.5663, "step": 4680 }, { "epoch": 0.17192712342827815, "grad_norm": 1.5247880984614344, "learning_rate": 9.300455835533863e-07, "loss": 1.6012, "step": 4690 }, { "epoch": 0.1722937057810037, "grad_norm": 1.7352070019598504, "learning_rate": 9.297510057842367e-07, "loss": 1.5681, "step": 4700 }, { "epoch": 0.17266028813372924, "grad_norm": 1.6435683033446582, "learning_rate": 9.294558559397633e-07, "loss": 1.6687, "step": 4710 }, { "epoch": 0.1730268704864548, "grad_norm": 1.3964234370853204, "learning_rate": 9.291601344128631e-07, "loss": 1.5829, "step": 4720 }, { "epoch": 0.1733934528391803, "grad_norm": 1.76715189072495, "learning_rate": 9.288638415971944e-07, "loss": 1.5724, "step": 4730 }, { "epoch": 0.17376003519190586, "grad_norm": 1.3087839062281306, "learning_rate": 9.285669778871758e-07, "loss": 1.6033, "step": 4740 }, { "epoch": 0.1741266175446314, "grad_norm": 1.7592015890177557, "learning_rate": 9.282695436779857e-07, "loss": 1.5787, "step": 4750 }, { "epoch": 0.17449319989735695, "grad_norm": 1.5281595493710598, "learning_rate": 9.279715393655625e-07, "loss": 1.5593, "step": 4760 }, { "epoch": 0.17485978225008247, "grad_norm": 1.738599325299021, "learning_rate": 9.276729653466029e-07, "loss": 1.5669, "step": 4770 }, { "epoch": 0.17522636460280802, "grad_norm": 1.594132633669574, "learning_rate": 9.273738220185624e-07, "loss": 1.623, "step": 4780 }, { "epoch": 0.17559294695553357, "grad_norm": 2.226861365359913, "learning_rate": 9.27074109779654e-07, "loss": 1.6368, "step": 4790 }, { "epoch": 0.1759595293082591, "grad_norm": 1.7870988536401553, "learning_rate": 9.267738290288484e-07, "loss": 1.5905, "step": 4800 }, { "epoch": 0.17632611166098464, "grad_norm": 1.6753244560734581, "learning_rate": 9.264729801658726e-07, "loss": 1.588, "step": 4810 }, { "epoch": 0.17669269401371018, "grad_norm": 1.5163383708898754, "learning_rate": 9.261715635912105e-07, "loss": 1.6068, "step": 4820 }, { "epoch": 0.17705927636643573, "grad_norm": 1.6054513357762625, "learning_rate": 9.258695797061011e-07, "loss": 1.5623, "step": 4830 }, { "epoch": 0.17742585871916125, "grad_norm": 1.7549519455125482, "learning_rate": 9.255670289125392e-07, "loss": 1.6342, "step": 4840 }, { "epoch": 0.1777924410718868, "grad_norm": 1.5524081159338652, "learning_rate": 9.252639116132737e-07, "loss": 1.5866, "step": 4850 }, { "epoch": 0.17815902342461234, "grad_norm": 1.5466546969225983, "learning_rate": 9.249602282118078e-07, "loss": 1.6022, "step": 4860 }, { "epoch": 0.1785256057773379, "grad_norm": 1.4959615382996556, "learning_rate": 9.246559791123984e-07, "loss": 1.6196, "step": 4870 }, { "epoch": 0.1788921881300634, "grad_norm": 1.4914720900146645, "learning_rate": 9.243511647200554e-07, "loss": 1.5919, "step": 4880 }, { "epoch": 0.17925877048278896, "grad_norm": 1.5337435868741187, "learning_rate": 9.240457854405411e-07, "loss": 1.6044, "step": 4890 }, { "epoch": 0.1796253528355145, "grad_norm": 1.6816858785763387, "learning_rate": 9.237398416803702e-07, "loss": 1.5634, "step": 4900 }, { "epoch": 0.17999193518824003, "grad_norm": 1.8428666379108207, "learning_rate": 9.234333338468079e-07, "loss": 1.5595, "step": 4910 }, { "epoch": 0.18035851754096557, "grad_norm": 1.4112423758680814, "learning_rate": 9.231262623478712e-07, "loss": 1.5958, "step": 4920 }, { "epoch": 0.18072509989369112, "grad_norm": 1.9379415330464052, "learning_rate": 9.228186275923271e-07, "loss": 1.6132, "step": 4930 }, { "epoch": 0.18109168224641667, "grad_norm": 1.6478659028610085, "learning_rate": 9.225104299896923e-07, "loss": 1.5253, "step": 4940 }, { "epoch": 0.1814582645991422, "grad_norm": 1.4723128432871142, "learning_rate": 9.222016699502329e-07, "loss": 1.6025, "step": 4950 }, { "epoch": 0.18182484695186774, "grad_norm": 1.7186069161894069, "learning_rate": 9.218923478849636e-07, "loss": 1.5888, "step": 4960 }, { "epoch": 0.18219142930459328, "grad_norm": 2.0518524516759706, "learning_rate": 9.215824642056473e-07, "loss": 1.6131, "step": 4970 }, { "epoch": 0.18255801165731883, "grad_norm": 1.7336503978028492, "learning_rate": 9.212720193247946e-07, "loss": 1.5725, "step": 4980 }, { "epoch": 0.18292459401004435, "grad_norm": 1.4722133429873332, "learning_rate": 9.209610136556629e-07, "loss": 1.5547, "step": 4990 }, { "epoch": 0.1832911763627699, "grad_norm": 1.6753596780660358, "learning_rate": 9.206494476122565e-07, "loss": 1.5997, "step": 5000 }, { "epoch": 0.1832911763627699, "eval_accuracy": 0.642745649510724, "eval_loss": 1.587723731994629, "eval_runtime": 309.6063, "eval_samples_per_second": 10.681, "eval_steps_per_second": 0.891, "step": 5000 }, { "epoch": 0.18365775871549544, "grad_norm": 1.5685677710443469, "learning_rate": 9.203373216093253e-07, "loss": 1.5679, "step": 5010 }, { "epoch": 0.18402434106822096, "grad_norm": 1.8335955057050302, "learning_rate": 9.200246360623647e-07, "loss": 1.5621, "step": 5020 }, { "epoch": 0.1843909234209465, "grad_norm": 1.522191845438261, "learning_rate": 9.19711391387615e-07, "loss": 1.5729, "step": 5030 }, { "epoch": 0.18475750577367206, "grad_norm": 1.6776006382527855, "learning_rate": 9.193975880020609e-07, "loss": 1.59, "step": 5040 }, { "epoch": 0.1851240881263976, "grad_norm": 1.626198881855077, "learning_rate": 9.190832263234307e-07, "loss": 1.5274, "step": 5050 }, { "epoch": 0.18549067047912313, "grad_norm": 1.7849118070867178, "learning_rate": 9.18768306770196e-07, "loss": 1.5976, "step": 5060 }, { "epoch": 0.18585725283184867, "grad_norm": 1.6492509263028217, "learning_rate": 9.184528297615706e-07, "loss": 1.574, "step": 5070 }, { "epoch": 0.18622383518457422, "grad_norm": 1.6650634512326183, "learning_rate": 9.181367957175111e-07, "loss": 1.6145, "step": 5080 }, { "epoch": 0.18659041753729974, "grad_norm": 1.728522905813247, "learning_rate": 9.178202050587152e-07, "loss": 1.623, "step": 5090 }, { "epoch": 0.1869569998900253, "grad_norm": 1.5996442049523565, "learning_rate": 9.175030582066215e-07, "loss": 1.5807, "step": 5100 }, { "epoch": 0.18732358224275084, "grad_norm": 2.127736796999369, "learning_rate": 9.17185355583409e-07, "loss": 1.6288, "step": 5110 }, { "epoch": 0.18769016459547638, "grad_norm": 1.7060344023543381, "learning_rate": 9.16867097611997e-07, "loss": 1.5706, "step": 5120 }, { "epoch": 0.1880567469482019, "grad_norm": 1.6633154215840553, "learning_rate": 9.165482847160433e-07, "loss": 1.6202, "step": 5130 }, { "epoch": 0.18842332930092745, "grad_norm": 2.008854546754292, "learning_rate": 9.162289173199449e-07, "loss": 1.5684, "step": 5140 }, { "epoch": 0.188789911653653, "grad_norm": 1.8267125273776432, "learning_rate": 9.159089958488368e-07, "loss": 1.5463, "step": 5150 }, { "epoch": 0.18915649400637854, "grad_norm": 1.5564239251002085, "learning_rate": 9.155885207285917e-07, "loss": 1.5432, "step": 5160 }, { "epoch": 0.18952307635910406, "grad_norm": 1.6146271060205803, "learning_rate": 9.152674923858192e-07, "loss": 1.5524, "step": 5170 }, { "epoch": 0.1898896587118296, "grad_norm": 1.5552810397285535, "learning_rate": 9.149459112478653e-07, "loss": 1.5704, "step": 5180 }, { "epoch": 0.19025624106455516, "grad_norm": 1.5384519496242604, "learning_rate": 9.146237777428119e-07, "loss": 1.5832, "step": 5190 }, { "epoch": 0.19062282341728068, "grad_norm": 2.017102331377888, "learning_rate": 9.143010922994761e-07, "loss": 1.5652, "step": 5200 }, { "epoch": 0.19098940577000623, "grad_norm": 1.8257390842642465, "learning_rate": 9.139778553474102e-07, "loss": 1.6286, "step": 5210 }, { "epoch": 0.19135598812273177, "grad_norm": 1.8375892545538077, "learning_rate": 9.136540673169e-07, "loss": 1.5999, "step": 5220 }, { "epoch": 0.19172257047545732, "grad_norm": 2.0587302949543327, "learning_rate": 9.133297286389652e-07, "loss": 1.5976, "step": 5230 }, { "epoch": 0.19208915282818284, "grad_norm": 2.011881523827466, "learning_rate": 9.130048397453586e-07, "loss": 1.5948, "step": 5240 }, { "epoch": 0.1924557351809084, "grad_norm": 1.8390608792602066, "learning_rate": 9.126794010685652e-07, "loss": 1.6149, "step": 5250 }, { "epoch": 0.19282231753363394, "grad_norm": 1.9246481251033047, "learning_rate": 9.123534130418022e-07, "loss": 1.5918, "step": 5260 }, { "epoch": 0.19318889988635948, "grad_norm": 1.716961973736044, "learning_rate": 9.120268760990177e-07, "loss": 1.5423, "step": 5270 }, { "epoch": 0.193555482239085, "grad_norm": 2.0653331266058053, "learning_rate": 9.116997906748906e-07, "loss": 1.5646, "step": 5280 }, { "epoch": 0.19392206459181055, "grad_norm": 1.518359023904073, "learning_rate": 9.113721572048303e-07, "loss": 1.5893, "step": 5290 }, { "epoch": 0.1942886469445361, "grad_norm": 1.5221964255305394, "learning_rate": 9.110439761249752e-07, "loss": 1.5944, "step": 5300 }, { "epoch": 0.19465522929726162, "grad_norm": 1.591016019300809, "learning_rate": 9.107152478721929e-07, "loss": 1.5957, "step": 5310 }, { "epoch": 0.19502181164998716, "grad_norm": 1.6048630337553804, "learning_rate": 9.103859728840797e-07, "loss": 1.5373, "step": 5320 }, { "epoch": 0.1953883940027127, "grad_norm": 1.8089344462427293, "learning_rate": 9.10056151598959e-07, "loss": 1.5484, "step": 5330 }, { "epoch": 0.19575497635543826, "grad_norm": 1.7077347921127968, "learning_rate": 9.097257844558821e-07, "loss": 1.5688, "step": 5340 }, { "epoch": 0.19612155870816378, "grad_norm": 2.0584080275062706, "learning_rate": 9.093948718946265e-07, "loss": 1.5202, "step": 5350 }, { "epoch": 0.19648814106088933, "grad_norm": 1.6275162784009292, "learning_rate": 9.090634143556961e-07, "loss": 1.5851, "step": 5360 }, { "epoch": 0.19685472341361487, "grad_norm": 1.7941515009032263, "learning_rate": 9.087314122803198e-07, "loss": 1.5794, "step": 5370 }, { "epoch": 0.19722130576634042, "grad_norm": 1.72604148825101, "learning_rate": 9.083988661104519e-07, "loss": 1.5966, "step": 5380 }, { "epoch": 0.19758788811906594, "grad_norm": 1.7824620622659664, "learning_rate": 9.080657762887706e-07, "loss": 1.5893, "step": 5390 }, { "epoch": 0.1979544704717915, "grad_norm": 1.710078177829696, "learning_rate": 9.077321432586779e-07, "loss": 1.5668, "step": 5400 }, { "epoch": 0.19832105282451704, "grad_norm": 1.8516264946489545, "learning_rate": 9.073979674642991e-07, "loss": 1.6049, "step": 5410 }, { "epoch": 0.19868763517724256, "grad_norm": 2.1561627747886583, "learning_rate": 9.070632493504815e-07, "loss": 1.585, "step": 5420 }, { "epoch": 0.1990542175299681, "grad_norm": 1.912041110250784, "learning_rate": 9.06727989362795e-07, "loss": 1.5196, "step": 5430 }, { "epoch": 0.19942079988269365, "grad_norm": 1.8404077118276456, "learning_rate": 9.063921879475306e-07, "loss": 1.611, "step": 5440 }, { "epoch": 0.1997873822354192, "grad_norm": 1.5865821224681815, "learning_rate": 9.060558455516996e-07, "loss": 1.5739, "step": 5450 }, { "epoch": 0.20015396458814472, "grad_norm": 1.9756512969668862, "learning_rate": 9.057189626230341e-07, "loss": 1.5002, "step": 5460 }, { "epoch": 0.20052054694087026, "grad_norm": 1.5812577707350812, "learning_rate": 9.053815396099851e-07, "loss": 1.5869, "step": 5470 }, { "epoch": 0.2008871292935958, "grad_norm": 2.0162867580185555, "learning_rate": 9.050435769617231e-07, "loss": 1.5559, "step": 5480 }, { "epoch": 0.20125371164632136, "grad_norm": 1.899649598636165, "learning_rate": 9.047050751281368e-07, "loss": 1.5407, "step": 5490 }, { "epoch": 0.20162029399904688, "grad_norm": 1.9101266806326496, "learning_rate": 9.043660345598322e-07, "loss": 1.5576, "step": 5500 }, { "epoch": 0.20198687635177243, "grad_norm": 2.0420669589479403, "learning_rate": 9.040264557081334e-07, "loss": 1.557, "step": 5510 }, { "epoch": 0.20235345870449797, "grad_norm": 1.9260883055795428, "learning_rate": 9.036863390250801e-07, "loss": 1.5521, "step": 5520 }, { "epoch": 0.2027200410572235, "grad_norm": 1.6555197284342995, "learning_rate": 9.033456849634284e-07, "loss": 1.5717, "step": 5530 }, { "epoch": 0.20308662340994904, "grad_norm": 2.153362825776131, "learning_rate": 9.030044939766497e-07, "loss": 1.5713, "step": 5540 }, { "epoch": 0.2034532057626746, "grad_norm": 1.910089724316295, "learning_rate": 9.026627665189303e-07, "loss": 1.5697, "step": 5550 }, { "epoch": 0.20381978811540014, "grad_norm": 1.7762617538543, "learning_rate": 9.0232050304517e-07, "loss": 1.5239, "step": 5560 }, { "epoch": 0.20418637046812566, "grad_norm": 1.7174298843577596, "learning_rate": 9.019777040109831e-07, "loss": 1.5276, "step": 5570 }, { "epoch": 0.2045529528208512, "grad_norm": 1.6862369469038345, "learning_rate": 9.016343698726961e-07, "loss": 1.5541, "step": 5580 }, { "epoch": 0.20491953517357675, "grad_norm": 1.875834526669963, "learning_rate": 9.01290501087348e-07, "loss": 1.555, "step": 5590 }, { "epoch": 0.2052861175263023, "grad_norm": 1.7840227955187389, "learning_rate": 9.009460981126898e-07, "loss": 1.5872, "step": 5600 }, { "epoch": 0.20565269987902782, "grad_norm": 1.668168953110993, "learning_rate": 9.006011614071829e-07, "loss": 1.599, "step": 5610 }, { "epoch": 0.20601928223175336, "grad_norm": 1.6951419814826267, "learning_rate": 9.002556914300001e-07, "loss": 1.5599, "step": 5620 }, { "epoch": 0.2063858645844789, "grad_norm": 2.031183645077938, "learning_rate": 8.999096886410234e-07, "loss": 1.5697, "step": 5630 }, { "epoch": 0.20675244693720443, "grad_norm": 2.2433698552413595, "learning_rate": 8.995631535008442e-07, "loss": 1.5751, "step": 5640 }, { "epoch": 0.20711902928992998, "grad_norm": 1.96339871171306, "learning_rate": 8.992160864707629e-07, "loss": 1.5922, "step": 5650 }, { "epoch": 0.20748561164265553, "grad_norm": 1.7341008984989021, "learning_rate": 8.988684880127877e-07, "loss": 1.5476, "step": 5660 }, { "epoch": 0.20785219399538107, "grad_norm": 1.6011033018349554, "learning_rate": 8.985203585896339e-07, "loss": 1.5337, "step": 5670 }, { "epoch": 0.2082187763481066, "grad_norm": 1.804008259917083, "learning_rate": 8.981716986647241e-07, "loss": 1.548, "step": 5680 }, { "epoch": 0.20858535870083214, "grad_norm": 1.7644993504571036, "learning_rate": 8.978225087021872e-07, "loss": 1.5566, "step": 5690 }, { "epoch": 0.2089519410535577, "grad_norm": 2.1995890332913812, "learning_rate": 8.974727891668568e-07, "loss": 1.509, "step": 5700 }, { "epoch": 0.2093185234062832, "grad_norm": 1.7307439040874695, "learning_rate": 8.971225405242724e-07, "loss": 1.5792, "step": 5710 }, { "epoch": 0.20968510575900876, "grad_norm": 1.8843347719325225, "learning_rate": 8.967717632406775e-07, "loss": 1.5745, "step": 5720 }, { "epoch": 0.2100516881117343, "grad_norm": 1.8994279922279045, "learning_rate": 8.964204577830193e-07, "loss": 1.5346, "step": 5730 }, { "epoch": 0.21041827046445985, "grad_norm": 2.0146207080838305, "learning_rate": 8.960686246189479e-07, "loss": 1.5724, "step": 5740 }, { "epoch": 0.21078485281718537, "grad_norm": 1.9175010632666802, "learning_rate": 8.957162642168164e-07, "loss": 1.482, "step": 5750 }, { "epoch": 0.21115143516991092, "grad_norm": 1.6492564643172203, "learning_rate": 8.953633770456791e-07, "loss": 1.5635, "step": 5760 }, { "epoch": 0.21151801752263646, "grad_norm": 1.8913486368556613, "learning_rate": 8.950099635752919e-07, "loss": 1.5634, "step": 5770 }, { "epoch": 0.211884599875362, "grad_norm": 1.7405053491856226, "learning_rate": 8.946560242761114e-07, "loss": 1.5475, "step": 5780 }, { "epoch": 0.21225118222808753, "grad_norm": 1.7166883252641594, "learning_rate": 8.943015596192938e-07, "loss": 1.516, "step": 5790 }, { "epoch": 0.21261776458081308, "grad_norm": 1.935712334758643, "learning_rate": 8.93946570076695e-07, "loss": 1.5575, "step": 5800 }, { "epoch": 0.21298434693353863, "grad_norm": 1.9385604701128256, "learning_rate": 8.935910561208693e-07, "loss": 1.5634, "step": 5810 }, { "epoch": 0.21335092928626415, "grad_norm": 2.557688500744313, "learning_rate": 8.932350182250694e-07, "loss": 1.5103, "step": 5820 }, { "epoch": 0.2137175116389897, "grad_norm": 1.7120107495237882, "learning_rate": 8.928784568632454e-07, "loss": 1.5332, "step": 5830 }, { "epoch": 0.21408409399171524, "grad_norm": 1.9120958570178155, "learning_rate": 8.925213725100439e-07, "loss": 1.5902, "step": 5840 }, { "epoch": 0.2144506763444408, "grad_norm": 2.0551912368717984, "learning_rate": 8.921637656408081e-07, "loss": 1.5784, "step": 5850 }, { "epoch": 0.2148172586971663, "grad_norm": 1.9480411905431083, "learning_rate": 8.918056367315765e-07, "loss": 1.5551, "step": 5860 }, { "epoch": 0.21518384104989186, "grad_norm": 2.072902657734444, "learning_rate": 8.914469862590825e-07, "loss": 1.5555, "step": 5870 }, { "epoch": 0.2155504234026174, "grad_norm": 1.9451661388320578, "learning_rate": 8.910878147007544e-07, "loss": 1.5513, "step": 5880 }, { "epoch": 0.21591700575534295, "grad_norm": 2.0629785589418104, "learning_rate": 8.907281225347132e-07, "loss": 1.5553, "step": 5890 }, { "epoch": 0.21628358810806847, "grad_norm": 1.863954721076218, "learning_rate": 8.903679102397735e-07, "loss": 1.5691, "step": 5900 }, { "epoch": 0.21665017046079402, "grad_norm": 1.8545804685124208, "learning_rate": 8.900071782954424e-07, "loss": 1.5331, "step": 5910 }, { "epoch": 0.21701675281351956, "grad_norm": 1.8522158136831326, "learning_rate": 8.896459271819181e-07, "loss": 1.5481, "step": 5920 }, { "epoch": 0.21738333516624508, "grad_norm": 2.114169763199409, "learning_rate": 8.892841573800909e-07, "loss": 1.5574, "step": 5930 }, { "epoch": 0.21774991751897063, "grad_norm": 2.2195708048317897, "learning_rate": 8.889218693715405e-07, "loss": 1.5632, "step": 5940 }, { "epoch": 0.21811649987169618, "grad_norm": 1.9709151192601133, "learning_rate": 8.885590636385373e-07, "loss": 1.5861, "step": 5950 }, { "epoch": 0.21848308222442173, "grad_norm": 1.9808333239294875, "learning_rate": 8.881957406640402e-07, "loss": 1.5065, "step": 5960 }, { "epoch": 0.21884966457714725, "grad_norm": 2.442742784557856, "learning_rate": 8.878319009316973e-07, "loss": 1.5445, "step": 5970 }, { "epoch": 0.2192162469298728, "grad_norm": 2.311119780435353, "learning_rate": 8.874675449258439e-07, "loss": 1.5483, "step": 5980 }, { "epoch": 0.21958282928259834, "grad_norm": 2.0035864035930655, "learning_rate": 8.871026731315031e-07, "loss": 1.5516, "step": 5990 }, { "epoch": 0.2199494116353239, "grad_norm": 1.9235134048584597, "learning_rate": 8.867372860343843e-07, "loss": 1.5841, "step": 6000 }, { "epoch": 0.2199494116353239, "eval_accuracy": 0.6509060196907062, "eval_loss": 1.540500521659851, "eval_runtime": 311.0144, "eval_samples_per_second": 10.633, "eval_steps_per_second": 0.887, "step": 6000 }, { "epoch": 0.2203159939880494, "grad_norm": 1.7524109005789064, "learning_rate": 8.863713841208831e-07, "loss": 1.5597, "step": 6010 }, { "epoch": 0.22068257634077496, "grad_norm": 1.6692328056749952, "learning_rate": 8.860049678780803e-07, "loss": 1.4923, "step": 6020 }, { "epoch": 0.2210491586935005, "grad_norm": 1.9399213197528828, "learning_rate": 8.856380377937411e-07, "loss": 1.552, "step": 6030 }, { "epoch": 0.22141574104622602, "grad_norm": 2.2904467183798753, "learning_rate": 8.852705943563153e-07, "loss": 1.5254, "step": 6040 }, { "epoch": 0.22178232339895157, "grad_norm": 1.8153750134894717, "learning_rate": 8.849026380549354e-07, "loss": 1.5141, "step": 6050 }, { "epoch": 0.22214890575167712, "grad_norm": 2.618147882062693, "learning_rate": 8.84534169379417e-07, "loss": 1.5427, "step": 6060 }, { "epoch": 0.22251548810440266, "grad_norm": 1.7910988941866253, "learning_rate": 8.84165188820258e-07, "loss": 1.5024, "step": 6070 }, { "epoch": 0.22288207045712818, "grad_norm": 2.1174011777995565, "learning_rate": 8.837956968686371e-07, "loss": 1.5354, "step": 6080 }, { "epoch": 0.22324865280985373, "grad_norm": 1.9009206870385398, "learning_rate": 8.834256940164142e-07, "loss": 1.5147, "step": 6090 }, { "epoch": 0.22361523516257928, "grad_norm": 1.8496325535415874, "learning_rate": 8.830551807561291e-07, "loss": 1.5179, "step": 6100 }, { "epoch": 0.22398181751530483, "grad_norm": 1.662570964745413, "learning_rate": 8.826841575810011e-07, "loss": 1.5187, "step": 6110 }, { "epoch": 0.22434839986803035, "grad_norm": 1.8932960142147148, "learning_rate": 8.823126249849283e-07, "loss": 1.511, "step": 6120 }, { "epoch": 0.2247149822207559, "grad_norm": 2.055911875635135, "learning_rate": 8.819405834624869e-07, "loss": 1.5155, "step": 6130 }, { "epoch": 0.22508156457348144, "grad_norm": 2.0651755539958603, "learning_rate": 8.815680335089308e-07, "loss": 1.4753, "step": 6140 }, { "epoch": 0.22544814692620696, "grad_norm": 2.0717254734315405, "learning_rate": 8.811949756201902e-07, "loss": 1.5565, "step": 6150 }, { "epoch": 0.2258147292789325, "grad_norm": 1.9847422671401158, "learning_rate": 8.808214102928721e-07, "loss": 1.5438, "step": 6160 }, { "epoch": 0.22618131163165806, "grad_norm": 2.4190623603018806, "learning_rate": 8.804473380242583e-07, "loss": 1.5399, "step": 6170 }, { "epoch": 0.2265478939843836, "grad_norm": 2.20009570928599, "learning_rate": 8.80072759312306e-07, "loss": 1.5398, "step": 6180 }, { "epoch": 0.22691447633710912, "grad_norm": 1.9921790637181438, "learning_rate": 8.796976746556462e-07, "loss": 1.4771, "step": 6190 }, { "epoch": 0.22728105868983467, "grad_norm": 2.0203680363068344, "learning_rate": 8.793220845535838e-07, "loss": 1.5176, "step": 6200 }, { "epoch": 0.22764764104256022, "grad_norm": 2.7532988176359754, "learning_rate": 8.789459895060962e-07, "loss": 1.5371, "step": 6210 }, { "epoch": 0.22801422339528576, "grad_norm": 1.937352911027064, "learning_rate": 8.785693900138329e-07, "loss": 1.5356, "step": 6220 }, { "epoch": 0.22838080574801128, "grad_norm": 1.9964616803134492, "learning_rate": 8.781922865781151e-07, "loss": 1.56, "step": 6230 }, { "epoch": 0.22874738810073683, "grad_norm": 2.106377863408321, "learning_rate": 8.778146797009349e-07, "loss": 1.559, "step": 6240 }, { "epoch": 0.22911397045346238, "grad_norm": 1.6409859726466804, "learning_rate": 8.774365698849547e-07, "loss": 1.5116, "step": 6250 }, { "epoch": 0.2294805528061879, "grad_norm": 2.305691070208384, "learning_rate": 8.770579576335058e-07, "loss": 1.5683, "step": 6260 }, { "epoch": 0.22984713515891345, "grad_norm": 1.7207294769909895, "learning_rate": 8.766788434505887e-07, "loss": 1.4618, "step": 6270 }, { "epoch": 0.230213717511639, "grad_norm": 1.9323445658200624, "learning_rate": 8.762992278408723e-07, "loss": 1.5618, "step": 6280 }, { "epoch": 0.23058029986436454, "grad_norm": 1.999152732092489, "learning_rate": 8.759191113096927e-07, "loss": 1.5569, "step": 6290 }, { "epoch": 0.23094688221709006, "grad_norm": 1.8502749258838977, "learning_rate": 8.755384943630529e-07, "loss": 1.5114, "step": 6300 }, { "epoch": 0.2313134645698156, "grad_norm": 2.0061014414371003, "learning_rate": 8.751573775076219e-07, "loss": 1.5011, "step": 6310 }, { "epoch": 0.23168004692254116, "grad_norm": 2.064565021271191, "learning_rate": 8.747757612507345e-07, "loss": 1.5588, "step": 6320 }, { "epoch": 0.23204662927526667, "grad_norm": 1.878533236916369, "learning_rate": 8.743936461003898e-07, "loss": 1.5179, "step": 6330 }, { "epoch": 0.23241321162799222, "grad_norm": 2.080116702687917, "learning_rate": 8.740110325652515e-07, "loss": 1.5211, "step": 6340 }, { "epoch": 0.23277979398071777, "grad_norm": 2.2534624739469433, "learning_rate": 8.736279211546465e-07, "loss": 1.5077, "step": 6350 }, { "epoch": 0.23314637633344332, "grad_norm": 2.1778452457873527, "learning_rate": 8.732443123785644e-07, "loss": 1.5385, "step": 6360 }, { "epoch": 0.23351295868616884, "grad_norm": 2.0802562378092317, "learning_rate": 8.72860206747657e-07, "loss": 1.5053, "step": 6370 }, { "epoch": 0.23387954103889438, "grad_norm": 2.197133342414823, "learning_rate": 8.724756047732376e-07, "loss": 1.5223, "step": 6380 }, { "epoch": 0.23424612339161993, "grad_norm": 2.3786394596220437, "learning_rate": 8.720905069672799e-07, "loss": 1.5124, "step": 6390 }, { "epoch": 0.23461270574434548, "grad_norm": 1.8455501641424978, "learning_rate": 8.717049138424182e-07, "loss": 1.525, "step": 6400 }, { "epoch": 0.234979288097071, "grad_norm": 2.0418699202678727, "learning_rate": 8.713188259119452e-07, "loss": 1.5082, "step": 6410 }, { "epoch": 0.23534587044979655, "grad_norm": 1.8308136052916946, "learning_rate": 8.709322436898135e-07, "loss": 1.4779, "step": 6420 }, { "epoch": 0.2357124528025221, "grad_norm": 2.155105815758525, "learning_rate": 8.705451676906328e-07, "loss": 1.5101, "step": 6430 }, { "epoch": 0.2360790351552476, "grad_norm": 1.9647757860923412, "learning_rate": 8.701575984296702e-07, "loss": 1.5105, "step": 6440 }, { "epoch": 0.23644561750797316, "grad_norm": 2.051510082680593, "learning_rate": 8.6976953642285e-07, "loss": 1.503, "step": 6450 }, { "epoch": 0.2368121998606987, "grad_norm": 2.1386714707947534, "learning_rate": 8.693809821867517e-07, "loss": 1.5282, "step": 6460 }, { "epoch": 0.23717878221342426, "grad_norm": 2.1401411616284167, "learning_rate": 8.689919362386104e-07, "loss": 1.4949, "step": 6470 }, { "epoch": 0.23754536456614977, "grad_norm": 1.956666297999974, "learning_rate": 8.686023990963157e-07, "loss": 1.4993, "step": 6480 }, { "epoch": 0.23791194691887532, "grad_norm": 2.0257118859168672, "learning_rate": 8.682123712784112e-07, "loss": 1.5186, "step": 6490 }, { "epoch": 0.23827852927160087, "grad_norm": 1.895169068962553, "learning_rate": 8.678218533040937e-07, "loss": 1.526, "step": 6500 }, { "epoch": 0.23864511162432642, "grad_norm": 6.529056788123207, "learning_rate": 8.67430845693212e-07, "loss": 1.4975, "step": 6510 }, { "epoch": 0.23901169397705194, "grad_norm": 2.078820041783562, "learning_rate": 8.670393489662673e-07, "loss": 1.5147, "step": 6520 }, { "epoch": 0.23937827632977748, "grad_norm": 2.313941233193865, "learning_rate": 8.666473636444116e-07, "loss": 1.5103, "step": 6530 }, { "epoch": 0.23974485868250303, "grad_norm": 2.204068052979437, "learning_rate": 8.662548902494473e-07, "loss": 1.5197, "step": 6540 }, { "epoch": 0.24011144103522855, "grad_norm": 2.6677538134182033, "learning_rate": 8.658619293038265e-07, "loss": 1.4539, "step": 6550 }, { "epoch": 0.2404780233879541, "grad_norm": 2.1826711924398876, "learning_rate": 8.654684813306508e-07, "loss": 1.4569, "step": 6560 }, { "epoch": 0.24084460574067965, "grad_norm": 2.4513733249404037, "learning_rate": 8.650745468536691e-07, "loss": 1.472, "step": 6570 }, { "epoch": 0.2412111880934052, "grad_norm": 1.9341316559705668, "learning_rate": 8.64680126397279e-07, "loss": 1.5128, "step": 6580 }, { "epoch": 0.2415777704461307, "grad_norm": 2.2183441842361753, "learning_rate": 8.642852204865243e-07, "loss": 1.5409, "step": 6590 }, { "epoch": 0.24194435279885626, "grad_norm": 2.270638521627112, "learning_rate": 8.638898296470953e-07, "loss": 1.4992, "step": 6600 }, { "epoch": 0.2423109351515818, "grad_norm": 2.6732843475957146, "learning_rate": 8.634939544053279e-07, "loss": 1.5335, "step": 6610 }, { "epoch": 0.24267751750430736, "grad_norm": 1.9291920434342291, "learning_rate": 8.630975952882026e-07, "loss": 1.4627, "step": 6620 }, { "epoch": 0.24304409985703287, "grad_norm": 2.05169281240212, "learning_rate": 8.627007528233445e-07, "loss": 1.5257, "step": 6630 }, { "epoch": 0.24341068220975842, "grad_norm": 2.42497111676382, "learning_rate": 8.623034275390214e-07, "loss": 1.5445, "step": 6640 }, { "epoch": 0.24377726456248397, "grad_norm": 2.1919485638499903, "learning_rate": 8.619056199641444e-07, "loss": 1.5115, "step": 6650 }, { "epoch": 0.2441438469152095, "grad_norm": 2.3664261903908343, "learning_rate": 8.615073306282663e-07, "loss": 1.4846, "step": 6660 }, { "epoch": 0.24451042926793504, "grad_norm": 2.7278440906317387, "learning_rate": 8.611085600615812e-07, "loss": 1.5419, "step": 6670 }, { "epoch": 0.24487701162066058, "grad_norm": 2.326361941668607, "learning_rate": 8.607093087949244e-07, "loss": 1.5447, "step": 6680 }, { "epoch": 0.24524359397338613, "grad_norm": 2.101465809666948, "learning_rate": 8.603095773597702e-07, "loss": 1.5147, "step": 6690 }, { "epoch": 0.24561017632611165, "grad_norm": 2.121131443755951, "learning_rate": 8.599093662882326e-07, "loss": 1.5046, "step": 6700 }, { "epoch": 0.2459767586788372, "grad_norm": 2.004374535392673, "learning_rate": 8.595086761130641e-07, "loss": 1.5104, "step": 6710 }, { "epoch": 0.24634334103156275, "grad_norm": 2.330571487353144, "learning_rate": 8.591075073676548e-07, "loss": 1.489, "step": 6720 }, { "epoch": 0.2467099233842883, "grad_norm": 1.954097712061658, "learning_rate": 8.587058605860319e-07, "loss": 1.4628, "step": 6730 }, { "epoch": 0.2470765057370138, "grad_norm": 2.287871494329092, "learning_rate": 8.583037363028591e-07, "loss": 1.4966, "step": 6740 }, { "epoch": 0.24744308808973936, "grad_norm": 2.2507921472351837, "learning_rate": 8.579011350534355e-07, "loss": 1.5148, "step": 6750 }, { "epoch": 0.2478096704424649, "grad_norm": 2.2811051866364034, "learning_rate": 8.574980573736951e-07, "loss": 1.5123, "step": 6760 }, { "epoch": 0.24817625279519043, "grad_norm": 2.0762345472822106, "learning_rate": 8.570945038002066e-07, "loss": 1.5538, "step": 6770 }, { "epoch": 0.24854283514791597, "grad_norm": 2.0481616873032618, "learning_rate": 8.566904748701718e-07, "loss": 1.5162, "step": 6780 }, { "epoch": 0.24890941750064152, "grad_norm": 1.977911548805274, "learning_rate": 8.562859711214252e-07, "loss": 1.4945, "step": 6790 }, { "epoch": 0.24927599985336707, "grad_norm": 2.166946374211255, "learning_rate": 8.558809930924336e-07, "loss": 1.5143, "step": 6800 }, { "epoch": 0.2496425822060926, "grad_norm": 2.265635068798512, "learning_rate": 8.554755413222952e-07, "loss": 1.5079, "step": 6810 }, { "epoch": 0.25000916455881816, "grad_norm": 2.376856602321205, "learning_rate": 8.550696163507384e-07, "loss": 1.5187, "step": 6820 }, { "epoch": 0.2503757469115437, "grad_norm": 2.329411952961872, "learning_rate": 8.54663218718122e-07, "loss": 1.4985, "step": 6830 }, { "epoch": 0.2507423292642692, "grad_norm": 2.127867609490789, "learning_rate": 8.542563489654337e-07, "loss": 1.5249, "step": 6840 }, { "epoch": 0.2511089116169948, "grad_norm": 2.3846188422530545, "learning_rate": 8.5384900763429e-07, "loss": 1.5157, "step": 6850 }, { "epoch": 0.2514754939697203, "grad_norm": 1.9837481727043949, "learning_rate": 8.534411952669348e-07, "loss": 1.5185, "step": 6860 }, { "epoch": 0.2518420763224458, "grad_norm": 2.0300743472877776, "learning_rate": 8.530329124062392e-07, "loss": 1.4726, "step": 6870 }, { "epoch": 0.2522086586751714, "grad_norm": 3.41153757527899, "learning_rate": 8.526241595957007e-07, "loss": 1.482, "step": 6880 }, { "epoch": 0.2525752410278969, "grad_norm": 2.7170854102243043, "learning_rate": 8.52214937379442e-07, "loss": 1.4518, "step": 6890 }, { "epoch": 0.25294182338062243, "grad_norm": 2.5040883653748294, "learning_rate": 8.518052463022112e-07, "loss": 1.4506, "step": 6900 }, { "epoch": 0.253308405733348, "grad_norm": 2.1362380301717807, "learning_rate": 8.513950869093802e-07, "loss": 1.4975, "step": 6910 }, { "epoch": 0.2536749880860735, "grad_norm": 56.61497948468882, "learning_rate": 8.509844597469442e-07, "loss": 1.5211, "step": 6920 }, { "epoch": 0.2540415704387991, "grad_norm": 2.161248343347086, "learning_rate": 8.505733653615217e-07, "loss": 1.5123, "step": 6930 }, { "epoch": 0.2544081527915246, "grad_norm": 2.197831076147601, "learning_rate": 8.501618043003522e-07, "loss": 1.4735, "step": 6940 }, { "epoch": 0.25477473514425014, "grad_norm": 2.730731478650521, "learning_rate": 8.497497771112975e-07, "loss": 1.5154, "step": 6950 }, { "epoch": 0.2551413174969757, "grad_norm": 2.625261843658038, "learning_rate": 8.49337284342839e-07, "loss": 1.4642, "step": 6960 }, { "epoch": 0.25550789984970124, "grad_norm": 3.6302229703502302, "learning_rate": 8.489243265440785e-07, "loss": 1.4339, "step": 6970 }, { "epoch": 0.25587448220242676, "grad_norm": 2.2912655831406408, "learning_rate": 8.485109042647361e-07, "loss": 1.5021, "step": 6980 }, { "epoch": 0.25624106455515233, "grad_norm": 8.005970124630041, "learning_rate": 8.48097018055151e-07, "loss": 1.4777, "step": 6990 }, { "epoch": 0.25660764690787785, "grad_norm": 2.2515437376163097, "learning_rate": 8.476826684662797e-07, "loss": 1.5096, "step": 7000 }, { "epoch": 0.25660764690787785, "eval_accuracy": 0.6611285662580546, "eval_loss": 1.4870213270187378, "eval_runtime": 310.8369, "eval_samples_per_second": 10.639, "eval_steps_per_second": 0.888, "step": 7000 }, { "epoch": 0.25697422926060337, "grad_norm": 2.531506922529387, "learning_rate": 8.472678560496955e-07, "loss": 1.4718, "step": 7010 }, { "epoch": 0.25734081161332895, "grad_norm": 2.6738422568666778, "learning_rate": 8.468525813575875e-07, "loss": 1.4849, "step": 7020 }, { "epoch": 0.25770739396605447, "grad_norm": 2.3045631257315256, "learning_rate": 8.464368449427608e-07, "loss": 1.3982, "step": 7030 }, { "epoch": 0.25807397631878004, "grad_norm": 2.3127941331475586, "learning_rate": 8.460206473586347e-07, "loss": 1.4584, "step": 7040 }, { "epoch": 0.25844055867150556, "grad_norm": 2.624025522294039, "learning_rate": 8.456039891592424e-07, "loss": 1.5064, "step": 7050 }, { "epoch": 0.2588071410242311, "grad_norm": 2.4392755048359906, "learning_rate": 8.451868708992305e-07, "loss": 1.4744, "step": 7060 }, { "epoch": 0.25917372337695666, "grad_norm": 2.244873049339989, "learning_rate": 8.447692931338577e-07, "loss": 1.4866, "step": 7070 }, { "epoch": 0.2595403057296822, "grad_norm": 2.7693601328533846, "learning_rate": 8.443512564189947e-07, "loss": 1.4264, "step": 7080 }, { "epoch": 0.2599068880824077, "grad_norm": 2.18123288795935, "learning_rate": 8.439327613111231e-07, "loss": 1.4487, "step": 7090 }, { "epoch": 0.26027347043513327, "grad_norm": 2.770780437192883, "learning_rate": 8.435138083673343e-07, "loss": 1.5298, "step": 7100 }, { "epoch": 0.2606400527878588, "grad_norm": 2.2581904540642737, "learning_rate": 8.430943981453298e-07, "loss": 1.4801, "step": 7110 }, { "epoch": 0.2610066351405843, "grad_norm": 2.3222299759291674, "learning_rate": 8.426745312034192e-07, "loss": 1.4896, "step": 7120 }, { "epoch": 0.2613732174933099, "grad_norm": 2.0280868196158908, "learning_rate": 8.422542081005209e-07, "loss": 1.4466, "step": 7130 }, { "epoch": 0.2617397998460354, "grad_norm": 2.224282133830904, "learning_rate": 8.418334293961593e-07, "loss": 1.5286, "step": 7140 }, { "epoch": 0.262106382198761, "grad_norm": 2.223919368251033, "learning_rate": 8.414121956504665e-07, "loss": 1.5043, "step": 7150 }, { "epoch": 0.2624729645514865, "grad_norm": 2.505467964910925, "learning_rate": 8.409905074241796e-07, "loss": 1.4781, "step": 7160 }, { "epoch": 0.262839546904212, "grad_norm": 2.0986445187287077, "learning_rate": 8.405683652786411e-07, "loss": 1.4804, "step": 7170 }, { "epoch": 0.2632061292569376, "grad_norm": 2.490412539205642, "learning_rate": 8.401457697757972e-07, "loss": 1.518, "step": 7180 }, { "epoch": 0.2635727116096631, "grad_norm": 2.6915376209294917, "learning_rate": 8.397227214781983e-07, "loss": 1.4812, "step": 7190 }, { "epoch": 0.26393929396238863, "grad_norm": 2.3046153435535235, "learning_rate": 8.392992209489973e-07, "loss": 1.5159, "step": 7200 }, { "epoch": 0.2643058763151142, "grad_norm": 2.508127660367551, "learning_rate": 8.388752687519489e-07, "loss": 1.4451, "step": 7210 }, { "epoch": 0.2646724586678397, "grad_norm": 3.1862145718553245, "learning_rate": 8.384508654514091e-07, "loss": 1.4609, "step": 7220 }, { "epoch": 0.26503904102056525, "grad_norm": 2.5580838478505803, "learning_rate": 8.380260116123343e-07, "loss": 1.4331, "step": 7230 }, { "epoch": 0.2654056233732908, "grad_norm": 2.257862509636175, "learning_rate": 8.376007078002813e-07, "loss": 1.45, "step": 7240 }, { "epoch": 0.26577220572601634, "grad_norm": 2.288080123372639, "learning_rate": 8.371749545814051e-07, "loss": 1.4389, "step": 7250 }, { "epoch": 0.2661387880787419, "grad_norm": 2.396647723381076, "learning_rate": 8.367487525224592e-07, "loss": 1.4366, "step": 7260 }, { "epoch": 0.26650537043146744, "grad_norm": 2.2979084143372868, "learning_rate": 8.363221021907949e-07, "loss": 1.4818, "step": 7270 }, { "epoch": 0.26687195278419296, "grad_norm": 2.1808515998354694, "learning_rate": 8.358950041543598e-07, "loss": 1.4542, "step": 7280 }, { "epoch": 0.26723853513691853, "grad_norm": 2.230268806261455, "learning_rate": 8.354674589816977e-07, "loss": 1.4329, "step": 7290 }, { "epoch": 0.26760511748964405, "grad_norm": 2.927648869466954, "learning_rate": 8.350394672419474e-07, "loss": 1.5225, "step": 7300 }, { "epoch": 0.26797169984236957, "grad_norm": 2.112114910370922, "learning_rate": 8.346110295048425e-07, "loss": 1.4225, "step": 7310 }, { "epoch": 0.26833828219509515, "grad_norm": 2.660467378126346, "learning_rate": 8.341821463407101e-07, "loss": 1.5031, "step": 7320 }, { "epoch": 0.26870486454782067, "grad_norm": 3.003354330326063, "learning_rate": 8.337528183204704e-07, "loss": 1.4707, "step": 7330 }, { "epoch": 0.2690714469005462, "grad_norm": 2.623779251977545, "learning_rate": 8.333230460156355e-07, "loss": 1.4794, "step": 7340 }, { "epoch": 0.26943802925327176, "grad_norm": 3.101895766048754, "learning_rate": 8.32892829998309e-07, "loss": 1.4667, "step": 7350 }, { "epoch": 0.2698046116059973, "grad_norm": 2.960369047027641, "learning_rate": 8.324621708411854e-07, "loss": 1.5522, "step": 7360 }, { "epoch": 0.2701711939587228, "grad_norm": 2.524100342925903, "learning_rate": 8.320310691175489e-07, "loss": 1.4526, "step": 7370 }, { "epoch": 0.2705377763114484, "grad_norm": 2.62363195310582, "learning_rate": 8.315995254012726e-07, "loss": 1.4018, "step": 7380 }, { "epoch": 0.2709043586641739, "grad_norm": 1.9920146887682115, "learning_rate": 8.311675402668188e-07, "loss": 1.3965, "step": 7390 }, { "epoch": 0.27127094101689947, "grad_norm": 2.18110821192289, "learning_rate": 8.307351142892364e-07, "loss": 1.4842, "step": 7400 }, { "epoch": 0.271637523369625, "grad_norm": 2.2188567896520497, "learning_rate": 8.303022480441617e-07, "loss": 1.4159, "step": 7410 }, { "epoch": 0.2720041057223505, "grad_norm": 2.858166839750072, "learning_rate": 8.298689421078171e-07, "loss": 1.3954, "step": 7420 }, { "epoch": 0.2723706880750761, "grad_norm": 2.740212521082454, "learning_rate": 8.294351970570099e-07, "loss": 1.4861, "step": 7430 }, { "epoch": 0.2727372704278016, "grad_norm": 3.419233012340433, "learning_rate": 8.290010134691326e-07, "loss": 1.4824, "step": 7440 }, { "epoch": 0.2731038527805271, "grad_norm": 2.4809215592986966, "learning_rate": 8.285663919221606e-07, "loss": 1.4938, "step": 7450 }, { "epoch": 0.2734704351332527, "grad_norm": 2.607478119047904, "learning_rate": 8.281313329946531e-07, "loss": 1.419, "step": 7460 }, { "epoch": 0.2738370174859782, "grad_norm": 2.8279213303777753, "learning_rate": 8.276958372657512e-07, "loss": 1.4801, "step": 7470 }, { "epoch": 0.27420359983870374, "grad_norm": 2.585541966605194, "learning_rate": 8.272599053151774e-07, "loss": 1.4154, "step": 7480 }, { "epoch": 0.2745701821914293, "grad_norm": 2.7236239018595336, "learning_rate": 8.268235377232351e-07, "loss": 1.4741, "step": 7490 }, { "epoch": 0.27493676454415483, "grad_norm": 2.2739375571211844, "learning_rate": 8.263867350708072e-07, "loss": 1.4447, "step": 7500 }, { "epoch": 0.2753033468968804, "grad_norm": 2.936703619541737, "learning_rate": 8.259494979393562e-07, "loss": 1.4811, "step": 7510 }, { "epoch": 0.2756699292496059, "grad_norm": 2.644051786280347, "learning_rate": 8.255118269109229e-07, "loss": 1.4359, "step": 7520 }, { "epoch": 0.27603651160233145, "grad_norm": 2.814370164816269, "learning_rate": 8.250737225681254e-07, "loss": 1.4697, "step": 7530 }, { "epoch": 0.276403093955057, "grad_norm": 2.7487477516640664, "learning_rate": 8.246351854941589e-07, "loss": 1.4677, "step": 7540 }, { "epoch": 0.27676967630778254, "grad_norm": 2.7840690479403807, "learning_rate": 8.241962162727946e-07, "loss": 1.462, "step": 7550 }, { "epoch": 0.27713625866050806, "grad_norm": 2.9784690105392366, "learning_rate": 8.237568154883788e-07, "loss": 1.4439, "step": 7560 }, { "epoch": 0.27750284101323364, "grad_norm": 2.8948634927350105, "learning_rate": 8.233169837258325e-07, "loss": 1.4705, "step": 7570 }, { "epoch": 0.27786942336595916, "grad_norm": 2.612491147603324, "learning_rate": 8.228767215706503e-07, "loss": 1.467, "step": 7580 }, { "epoch": 0.2782360057186847, "grad_norm": 2.8002040163179736, "learning_rate": 8.224360296088995e-07, "loss": 1.4573, "step": 7590 }, { "epoch": 0.27860258807141025, "grad_norm": 2.8029823959562155, "learning_rate": 8.219949084272201e-07, "loss": 1.4804, "step": 7600 }, { "epoch": 0.27896917042413577, "grad_norm": 2.6888372781846375, "learning_rate": 8.21553358612823e-07, "loss": 1.4633, "step": 7610 }, { "epoch": 0.27933575277686135, "grad_norm": 2.279721839418087, "learning_rate": 8.2111138075349e-07, "loss": 1.4713, "step": 7620 }, { "epoch": 0.27970233512958687, "grad_norm": 2.3829035564919807, "learning_rate": 8.206689754375724e-07, "loss": 1.4387, "step": 7630 }, { "epoch": 0.2800689174823124, "grad_norm": 3.7962407630882384, "learning_rate": 8.202261432539907e-07, "loss": 1.4025, "step": 7640 }, { "epoch": 0.28043549983503796, "grad_norm": 2.797043930833034, "learning_rate": 8.197828847922337e-07, "loss": 1.4576, "step": 7650 }, { "epoch": 0.2808020821877635, "grad_norm": 3.256545613051792, "learning_rate": 8.193392006423574e-07, "loss": 1.432, "step": 7660 }, { "epoch": 0.281168664540489, "grad_norm": 2.432668523438971, "learning_rate": 8.188950913949848e-07, "loss": 1.456, "step": 7670 }, { "epoch": 0.2815352468932146, "grad_norm": 2.4546993774133856, "learning_rate": 8.184505576413043e-07, "loss": 1.392, "step": 7680 }, { "epoch": 0.2819018292459401, "grad_norm": 3.0030506631971776, "learning_rate": 8.180055999730702e-07, "loss": 1.365, "step": 7690 }, { "epoch": 0.2822684115986656, "grad_norm": 2.9439493487762465, "learning_rate": 8.175602189826001e-07, "loss": 1.4292, "step": 7700 }, { "epoch": 0.2826349939513912, "grad_norm": 2.620909787731563, "learning_rate": 8.171144152627761e-07, "loss": 1.4251, "step": 7710 }, { "epoch": 0.2830015763041167, "grad_norm": 3.263683256322055, "learning_rate": 8.16668189407042e-07, "loss": 1.3899, "step": 7720 }, { "epoch": 0.2833681586568423, "grad_norm": 2.5437523385064953, "learning_rate": 8.162215420094045e-07, "loss": 1.3683, "step": 7730 }, { "epoch": 0.2837347410095678, "grad_norm": 2.4580551613838844, "learning_rate": 8.15774473664431e-07, "loss": 1.3732, "step": 7740 }, { "epoch": 0.2841013233622933, "grad_norm": 2.8279077970597184, "learning_rate": 8.153269849672493e-07, "loss": 1.419, "step": 7750 }, { "epoch": 0.2844679057150189, "grad_norm": 3.041958703900493, "learning_rate": 8.148790765135465e-07, "loss": 1.4356, "step": 7760 }, { "epoch": 0.2848344880677444, "grad_norm": 2.4582661578514426, "learning_rate": 8.144307488995689e-07, "loss": 1.4378, "step": 7770 }, { "epoch": 0.28520107042046994, "grad_norm": 2.8361019596271726, "learning_rate": 8.139820027221208e-07, "loss": 1.4111, "step": 7780 }, { "epoch": 0.2855676527731955, "grad_norm": 2.4415137770737427, "learning_rate": 8.135328385785631e-07, "loss": 1.4996, "step": 7790 }, { "epoch": 0.28593423512592103, "grad_norm": 2.1392002967653094, "learning_rate": 8.130832570668139e-07, "loss": 1.433, "step": 7800 }, { "epoch": 0.28630081747864655, "grad_norm": 3.061322031102369, "learning_rate": 8.126332587853462e-07, "loss": 1.4051, "step": 7810 }, { "epoch": 0.2866673998313721, "grad_norm": 3.2748819767509354, "learning_rate": 8.12182844333188e-07, "loss": 1.3863, "step": 7820 }, { "epoch": 0.28703398218409765, "grad_norm": 3.1866933217967603, "learning_rate": 8.117320143099216e-07, "loss": 1.4173, "step": 7830 }, { "epoch": 0.2874005645368232, "grad_norm": 2.9290211285749175, "learning_rate": 8.11280769315682e-07, "loss": 1.4395, "step": 7840 }, { "epoch": 0.28776714688954874, "grad_norm": 2.7212160772193474, "learning_rate": 8.108291099511571e-07, "loss": 1.4503, "step": 7850 }, { "epoch": 0.28813372924227426, "grad_norm": 2.3892746869258317, "learning_rate": 8.10377036817586e-07, "loss": 1.4368, "step": 7860 }, { "epoch": 0.28850031159499984, "grad_norm": 3.4107926691510277, "learning_rate": 8.099245505167589e-07, "loss": 1.4623, "step": 7870 }, { "epoch": 0.28886689394772536, "grad_norm": 3.1259277735027307, "learning_rate": 8.094716516510156e-07, "loss": 1.4412, "step": 7880 }, { "epoch": 0.2892334763004509, "grad_norm": 2.9135343767151154, "learning_rate": 8.090183408232459e-07, "loss": 1.4187, "step": 7890 }, { "epoch": 0.28960005865317645, "grad_norm": 3.30617041516701, "learning_rate": 8.085646186368867e-07, "loss": 1.4176, "step": 7900 }, { "epoch": 0.28996664100590197, "grad_norm": 3.1801194693312556, "learning_rate": 8.081104856959238e-07, "loss": 1.4534, "step": 7910 }, { "epoch": 0.2903332233586275, "grad_norm": 3.2431476470574983, "learning_rate": 8.07655942604889e-07, "loss": 1.3469, "step": 7920 }, { "epoch": 0.29069980571135307, "grad_norm": 3.1005913247685237, "learning_rate": 8.072009899688605e-07, "loss": 1.417, "step": 7930 }, { "epoch": 0.2910663880640786, "grad_norm": 2.953054099149132, "learning_rate": 8.067456283934614e-07, "loss": 1.4252, "step": 7940 }, { "epoch": 0.29143297041680416, "grad_norm": 2.6363992565855803, "learning_rate": 8.062898584848592e-07, "loss": 1.4499, "step": 7950 }, { "epoch": 0.2917995527695297, "grad_norm": 2.7290690238502635, "learning_rate": 8.05833680849765e-07, "loss": 1.4716, "step": 7960 }, { "epoch": 0.2921661351222552, "grad_norm": 3.21591143424738, "learning_rate": 8.053770960954328e-07, "loss": 1.3969, "step": 7970 }, { "epoch": 0.2925327174749808, "grad_norm": 3.8732639515812575, "learning_rate": 8.049201048296585e-07, "loss": 1.463, "step": 7980 }, { "epoch": 0.2928992998277063, "grad_norm": 2.9966394441630126, "learning_rate": 8.044627076607789e-07, "loss": 1.4545, "step": 7990 }, { "epoch": 0.2932658821804318, "grad_norm": 3.1577560282041017, "learning_rate": 8.040049051976713e-07, "loss": 1.4682, "step": 8000 }, { "epoch": 0.2932658821804318, "eval_accuracy": 0.6739903313977985, "eval_loss": 1.4271955490112305, "eval_runtime": 311.2156, "eval_samples_per_second": 10.626, "eval_steps_per_second": 0.887, "step": 8000 }, { "epoch": 0.2936324645331574, "grad_norm": 2.957786000444244, "learning_rate": 8.035466980497526e-07, "loss": 1.4592, "step": 8010 }, { "epoch": 0.2939990468858829, "grad_norm": 2.765279941343725, "learning_rate": 8.030880868269785e-07, "loss": 1.4404, "step": 8020 }, { "epoch": 0.29436562923860843, "grad_norm": 2.803405395861366, "learning_rate": 8.026290721398421e-07, "loss": 1.3642, "step": 8030 }, { "epoch": 0.294732211591334, "grad_norm": 3.134947642226663, "learning_rate": 8.02169654599374e-07, "loss": 1.4662, "step": 8040 }, { "epoch": 0.2950987939440595, "grad_norm": 3.3888445829207923, "learning_rate": 8.017098348171411e-07, "loss": 1.4092, "step": 8050 }, { "epoch": 0.2954653762967851, "grad_norm": 2.595961601811049, "learning_rate": 8.012496134052457e-07, "loss": 1.3772, "step": 8060 }, { "epoch": 0.2958319586495106, "grad_norm": 3.724884065568925, "learning_rate": 8.007889909763246e-07, "loss": 1.3862, "step": 8070 }, { "epoch": 0.29619854100223614, "grad_norm": 3.6608857589920754, "learning_rate": 8.003279681435482e-07, "loss": 1.444, "step": 8080 }, { "epoch": 0.2965651233549617, "grad_norm": 2.7154240671865213, "learning_rate": 7.998665455206206e-07, "loss": 1.4285, "step": 8090 }, { "epoch": 0.29693170570768723, "grad_norm": 2.7151538150939927, "learning_rate": 7.994047237217776e-07, "loss": 1.4489, "step": 8100 }, { "epoch": 0.29729828806041275, "grad_norm": 2.9729575587995742, "learning_rate": 7.989425033617863e-07, "loss": 1.4289, "step": 8110 }, { "epoch": 0.2976648704131383, "grad_norm": 3.298808013574498, "learning_rate": 7.984798850559447e-07, "loss": 1.4607, "step": 8120 }, { "epoch": 0.29803145276586385, "grad_norm": 3.1491445672684866, "learning_rate": 7.980168694200804e-07, "loss": 1.4097, "step": 8130 }, { "epoch": 0.29839803511858937, "grad_norm": 3.6399703354293007, "learning_rate": 7.975534570705497e-07, "loss": 1.3743, "step": 8140 }, { "epoch": 0.29876461747131494, "grad_norm": 3.2547493183004974, "learning_rate": 7.970896486242374e-07, "loss": 1.4346, "step": 8150 }, { "epoch": 0.29913119982404046, "grad_norm": 3.421650269839234, "learning_rate": 7.966254446985553e-07, "loss": 1.43, "step": 8160 }, { "epoch": 0.29949778217676604, "grad_norm": 3.797293850962011, "learning_rate": 7.961608459114416e-07, "loss": 1.4651, "step": 8170 }, { "epoch": 0.29986436452949156, "grad_norm": 3.5920498224364508, "learning_rate": 7.956958528813604e-07, "loss": 1.3738, "step": 8180 }, { "epoch": 0.3002309468822171, "grad_norm": 3.238482918382144, "learning_rate": 7.952304662273003e-07, "loss": 1.3987, "step": 8190 }, { "epoch": 0.30059752923494265, "grad_norm": 2.7498611423368176, "learning_rate": 7.947646865687742e-07, "loss": 1.4181, "step": 8200 }, { "epoch": 0.30096411158766817, "grad_norm": 4.031428344222072, "learning_rate": 7.942985145258179e-07, "loss": 1.4294, "step": 8210 }, { "epoch": 0.3013306939403937, "grad_norm": 2.643218639323195, "learning_rate": 7.938319507189894e-07, "loss": 1.4302, "step": 8220 }, { "epoch": 0.30169727629311927, "grad_norm": 3.1275133100531227, "learning_rate": 7.933649957693689e-07, "loss": 1.348, "step": 8230 }, { "epoch": 0.3020638586458448, "grad_norm": 3.521399879217592, "learning_rate": 7.928976502985565e-07, "loss": 1.4328, "step": 8240 }, { "epoch": 0.3024304409985703, "grad_norm": 3.1834120547065665, "learning_rate": 7.924299149286725e-07, "loss": 1.4742, "step": 8250 }, { "epoch": 0.3027970233512959, "grad_norm": 3.631213709741295, "learning_rate": 7.919617902823563e-07, "loss": 1.4068, "step": 8260 }, { "epoch": 0.3031636057040214, "grad_norm": 2.726938578010126, "learning_rate": 7.914932769827653e-07, "loss": 1.4359, "step": 8270 }, { "epoch": 0.303530188056747, "grad_norm": 3.7017959652425882, "learning_rate": 7.910243756535744e-07, "loss": 1.3344, "step": 8280 }, { "epoch": 0.3038967704094725, "grad_norm": 3.3417669291832066, "learning_rate": 7.90555086918975e-07, "loss": 1.4121, "step": 8290 }, { "epoch": 0.304263352762198, "grad_norm": 2.733351967687222, "learning_rate": 7.900854114036743e-07, "loss": 1.3732, "step": 8300 }, { "epoch": 0.3046299351149236, "grad_norm": 3.1756478835337476, "learning_rate": 7.89615349732894e-07, "loss": 1.4007, "step": 8310 }, { "epoch": 0.3049965174676491, "grad_norm": 3.238758242953075, "learning_rate": 7.891449025323703e-07, "loss": 1.4288, "step": 8320 }, { "epoch": 0.30536309982037463, "grad_norm": 2.6053607033892043, "learning_rate": 7.886740704283525e-07, "loss": 1.4156, "step": 8330 }, { "epoch": 0.3057296821731002, "grad_norm": 3.4053915363354417, "learning_rate": 7.88202854047602e-07, "loss": 1.3763, "step": 8340 }, { "epoch": 0.3060962645258257, "grad_norm": 3.715425460301463, "learning_rate": 7.877312540173922e-07, "loss": 1.4036, "step": 8350 }, { "epoch": 0.30646284687855124, "grad_norm": 2.9427971805533697, "learning_rate": 7.872592709655066e-07, "loss": 1.4385, "step": 8360 }, { "epoch": 0.3068294292312768, "grad_norm": 3.5845846532616426, "learning_rate": 7.867869055202392e-07, "loss": 1.415, "step": 8370 }, { "epoch": 0.30719601158400234, "grad_norm": 3.331222139254396, "learning_rate": 7.863141583103927e-07, "loss": 1.4126, "step": 8380 }, { "epoch": 0.3075625939367279, "grad_norm": 3.1984388430808406, "learning_rate": 7.85841029965278e-07, "loss": 1.3826, "step": 8390 }, { "epoch": 0.30792917628945343, "grad_norm": 3.1255012278404615, "learning_rate": 7.853675211147134e-07, "loss": 1.383, "step": 8400 }, { "epoch": 0.30829575864217895, "grad_norm": 3.329583698840508, "learning_rate": 7.848936323890239e-07, "loss": 1.3931, "step": 8410 }, { "epoch": 0.3086623409949045, "grad_norm": 3.9347250968462055, "learning_rate": 7.844193644190396e-07, "loss": 1.415, "step": 8420 }, { "epoch": 0.30902892334763005, "grad_norm": 4.137255951707039, "learning_rate": 7.839447178360963e-07, "loss": 1.3998, "step": 8430 }, { "epoch": 0.30939550570035557, "grad_norm": 2.6794621566293917, "learning_rate": 7.834696932720331e-07, "loss": 1.4228, "step": 8440 }, { "epoch": 0.30976208805308114, "grad_norm": 2.726588078339754, "learning_rate": 7.829942913591925e-07, "loss": 1.4486, "step": 8450 }, { "epoch": 0.31012867040580666, "grad_norm": 3.6162463016794026, "learning_rate": 7.825185127304194e-07, "loss": 1.4051, "step": 8460 }, { "epoch": 0.3104952527585322, "grad_norm": 2.910711368055256, "learning_rate": 7.820423580190603e-07, "loss": 1.41, "step": 8470 }, { "epoch": 0.31086183511125776, "grad_norm": 4.136385316326493, "learning_rate": 7.815658278589619e-07, "loss": 1.3859, "step": 8480 }, { "epoch": 0.3112284174639833, "grad_norm": 2.1538443576824404, "learning_rate": 7.810889228844708e-07, "loss": 1.4113, "step": 8490 }, { "epoch": 0.31159499981670885, "grad_norm": 3.1055419264140727, "learning_rate": 7.806116437304331e-07, "loss": 1.4327, "step": 8500 }, { "epoch": 0.31196158216943437, "grad_norm": 3.183052960747229, "learning_rate": 7.801339910321922e-07, "loss": 1.4179, "step": 8510 }, { "epoch": 0.3123281645221599, "grad_norm": 4.6955784323633925, "learning_rate": 7.796559654255894e-07, "loss": 1.3961, "step": 8520 }, { "epoch": 0.31269474687488547, "grad_norm": 3.227174794853267, "learning_rate": 7.79177567546962e-07, "loss": 1.4082, "step": 8530 }, { "epoch": 0.313061329227611, "grad_norm": 2.8264595214995243, "learning_rate": 7.78698798033143e-07, "loss": 1.4136, "step": 8540 }, { "epoch": 0.3134279115803365, "grad_norm": 3.7915043909577624, "learning_rate": 7.782196575214601e-07, "loss": 1.3758, "step": 8550 }, { "epoch": 0.3137944939330621, "grad_norm": 4.070976938559408, "learning_rate": 7.777401466497349e-07, "loss": 1.3915, "step": 8560 }, { "epoch": 0.3141610762857876, "grad_norm": 3.3538502722425916, "learning_rate": 7.772602660562819e-07, "loss": 1.3718, "step": 8570 }, { "epoch": 0.3145276586385131, "grad_norm": 3.230342363406807, "learning_rate": 7.767800163799081e-07, "loss": 1.3408, "step": 8580 }, { "epoch": 0.3148942409912387, "grad_norm": 3.6144160833487415, "learning_rate": 7.762993982599113e-07, "loss": 1.4296, "step": 8590 }, { "epoch": 0.3152608233439642, "grad_norm": 3.1182771552970374, "learning_rate": 7.758184123360803e-07, "loss": 1.3858, "step": 8600 }, { "epoch": 0.3156274056966898, "grad_norm": 3.5319206230022977, "learning_rate": 7.75337059248693e-07, "loss": 1.4342, "step": 8610 }, { "epoch": 0.3159939880494153, "grad_norm": 4.327639493570607, "learning_rate": 7.748553396385163e-07, "loss": 1.3915, "step": 8620 }, { "epoch": 0.31636057040214083, "grad_norm": 3.9982142503751326, "learning_rate": 7.743732541468053e-07, "loss": 1.363, "step": 8630 }, { "epoch": 0.3167271527548664, "grad_norm": 2.8786530129074728, "learning_rate": 7.738908034153015e-07, "loss": 1.3589, "step": 8640 }, { "epoch": 0.3170937351075919, "grad_norm": 4.4947342914569095, "learning_rate": 7.734079880862333e-07, "loss": 1.3506, "step": 8650 }, { "epoch": 0.31746031746031744, "grad_norm": 3.1518608629753477, "learning_rate": 7.729248088023139e-07, "loss": 1.3847, "step": 8660 }, { "epoch": 0.317826899813043, "grad_norm": 3.8964914548994534, "learning_rate": 7.724412662067415e-07, "loss": 1.3616, "step": 8670 }, { "epoch": 0.31819348216576854, "grad_norm": 4.158332163473049, "learning_rate": 7.719573609431971e-07, "loss": 1.3477, "step": 8680 }, { "epoch": 0.31856006451849406, "grad_norm": 5.31244346458908, "learning_rate": 7.714730936558455e-07, "loss": 1.3885, "step": 8690 }, { "epoch": 0.31892664687121963, "grad_norm": 3.5750048314109946, "learning_rate": 7.709884649893328e-07, "loss": 1.3763, "step": 8700 }, { "epoch": 0.31929322922394515, "grad_norm": 3.5013927398683444, "learning_rate": 7.70503475588786e-07, "loss": 1.3437, "step": 8710 }, { "epoch": 0.31965981157667067, "grad_norm": 3.772854937898392, "learning_rate": 7.700181260998131e-07, "loss": 1.434, "step": 8720 }, { "epoch": 0.32002639392939625, "grad_norm": 3.939247516045474, "learning_rate": 7.695324171685004e-07, "loss": 1.384, "step": 8730 }, { "epoch": 0.32039297628212177, "grad_norm": 3.3160045433400334, "learning_rate": 7.690463494414137e-07, "loss": 1.3681, "step": 8740 }, { "epoch": 0.32075955863484734, "grad_norm": 3.2760601494452533, "learning_rate": 7.685599235655955e-07, "loss": 1.3576, "step": 8750 }, { "epoch": 0.32112614098757286, "grad_norm": 3.917398028616676, "learning_rate": 7.680731401885658e-07, "loss": 1.4109, "step": 8760 }, { "epoch": 0.3214927233402984, "grad_norm": 4.3801775022523355, "learning_rate": 7.675859999583202e-07, "loss": 1.3688, "step": 8770 }, { "epoch": 0.32185930569302396, "grad_norm": 3.52546033919284, "learning_rate": 7.670985035233291e-07, "loss": 1.3803, "step": 8780 }, { "epoch": 0.3222258880457495, "grad_norm": 3.4568824402601925, "learning_rate": 7.666106515325374e-07, "loss": 1.3615, "step": 8790 }, { "epoch": 0.322592470398475, "grad_norm": 2.7983015500958826, "learning_rate": 7.661224446353634e-07, "loss": 1.3767, "step": 8800 }, { "epoch": 0.32295905275120057, "grad_norm": 3.4581919245368904, "learning_rate": 7.656338834816976e-07, "loss": 1.3768, "step": 8810 }, { "epoch": 0.3233256351039261, "grad_norm": 3.7176544154346054, "learning_rate": 7.651449687219018e-07, "loss": 1.3312, "step": 8820 }, { "epoch": 0.3236922174566516, "grad_norm": 3.6712040176600502, "learning_rate": 7.646557010068091e-07, "loss": 1.3981, "step": 8830 }, { "epoch": 0.3240587998093772, "grad_norm": 2.8962404949789637, "learning_rate": 7.641660809877222e-07, "loss": 1.4085, "step": 8840 }, { "epoch": 0.3244253821621027, "grad_norm": 5.2069626245172635, "learning_rate": 7.636761093164126e-07, "loss": 1.3489, "step": 8850 }, { "epoch": 0.3247919645148283, "grad_norm": 3.3614052591604793, "learning_rate": 7.631857866451204e-07, "loss": 1.391, "step": 8860 }, { "epoch": 0.3251585468675538, "grad_norm": 3.1183008582079417, "learning_rate": 7.626951136265523e-07, "loss": 1.3966, "step": 8870 }, { "epoch": 0.3255251292202793, "grad_norm": 4.337276600886146, "learning_rate": 7.622040909138818e-07, "loss": 1.3566, "step": 8880 }, { "epoch": 0.3258917115730049, "grad_norm": 4.083650404603487, "learning_rate": 7.617127191607479e-07, "loss": 1.3928, "step": 8890 }, { "epoch": 0.3262582939257304, "grad_norm": 3.847428171873619, "learning_rate": 7.612209990212543e-07, "loss": 1.3259, "step": 8900 }, { "epoch": 0.32662487627845593, "grad_norm": 3.2197146488177384, "learning_rate": 7.607289311499678e-07, "loss": 1.376, "step": 8910 }, { "epoch": 0.3269914586311815, "grad_norm": 3.4983962191005373, "learning_rate": 7.60236516201919e-07, "loss": 1.3927, "step": 8920 }, { "epoch": 0.32735804098390703, "grad_norm": 3.610610377134006, "learning_rate": 7.597437548326002e-07, "loss": 1.3792, "step": 8930 }, { "epoch": 0.32772462333663255, "grad_norm": 5.095826376758547, "learning_rate": 7.592506476979644e-07, "loss": 1.358, "step": 8940 }, { "epoch": 0.3280912056893581, "grad_norm": 3.3863305431901183, "learning_rate": 7.587571954544254e-07, "loss": 1.3983, "step": 8950 }, { "epoch": 0.32845778804208364, "grad_norm": 3.5975350890244067, "learning_rate": 7.582633987588563e-07, "loss": 1.4057, "step": 8960 }, { "epoch": 0.3288243703948092, "grad_norm": 3.848485096118636, "learning_rate": 7.577692582685886e-07, "loss": 1.3814, "step": 8970 }, { "epoch": 0.32919095274753474, "grad_norm": 3.157404479059578, "learning_rate": 7.572747746414117e-07, "loss": 1.4095, "step": 8980 }, { "epoch": 0.32955753510026026, "grad_norm": 4.1043127446716285, "learning_rate": 7.567799485355715e-07, "loss": 1.3755, "step": 8990 }, { "epoch": 0.32992411745298583, "grad_norm": 3.7156219870736615, "learning_rate": 7.562847806097696e-07, "loss": 1.3526, "step": 9000 }, { "epoch": 0.32992411745298583, "eval_accuracy": 0.688625248964108, "eval_loss": 1.3686386346817017, "eval_runtime": 311.2444, "eval_samples_per_second": 10.625, "eval_steps_per_second": 0.887, "step": 9000 }, { "epoch": 0.33029069980571135, "grad_norm": 4.016168592808031, "learning_rate": 7.557892715231634e-07, "loss": 1.3607, "step": 9010 }, { "epoch": 0.33065728215843687, "grad_norm": 3.504820069720998, "learning_rate": 7.552934219353638e-07, "loss": 1.3833, "step": 9020 }, { "epoch": 0.33102386451116245, "grad_norm": 3.3563895186210875, "learning_rate": 7.547972325064351e-07, "loss": 1.393, "step": 9030 }, { "epoch": 0.33139044686388797, "grad_norm": 3.401944814988902, "learning_rate": 7.543007038968939e-07, "loss": 1.3708, "step": 9040 }, { "epoch": 0.3317570292166135, "grad_norm": 4.8917426491539935, "learning_rate": 7.538038367677087e-07, "loss": 1.329, "step": 9050 }, { "epoch": 0.33212361156933906, "grad_norm": 4.014824315681244, "learning_rate": 7.53306631780298e-07, "loss": 1.3464, "step": 9060 }, { "epoch": 0.3324901939220646, "grad_norm": 3.9395593086417637, "learning_rate": 7.52809089596531e-07, "loss": 1.4059, "step": 9070 }, { "epoch": 0.33285677627479016, "grad_norm": 3.5141323515233274, "learning_rate": 7.523112108787247e-07, "loss": 1.3467, "step": 9080 }, { "epoch": 0.3332233586275157, "grad_norm": 4.310837199551292, "learning_rate": 7.518129962896448e-07, "loss": 1.3432, "step": 9090 }, { "epoch": 0.3335899409802412, "grad_norm": 4.049279934012434, "learning_rate": 7.513144464925036e-07, "loss": 1.4107, "step": 9100 }, { "epoch": 0.33395652333296677, "grad_norm": 5.43599736913238, "learning_rate": 7.508155621509603e-07, "loss": 1.3779, "step": 9110 }, { "epoch": 0.3343231056856923, "grad_norm": 4.312594101718665, "learning_rate": 7.503163439291187e-07, "loss": 1.3279, "step": 9120 }, { "epoch": 0.3346896880384178, "grad_norm": 3.7888042986131794, "learning_rate": 7.498167924915276e-07, "loss": 1.3422, "step": 9130 }, { "epoch": 0.3350562703911434, "grad_norm": 4.6227274755808665, "learning_rate": 7.493169085031791e-07, "loss": 1.3489, "step": 9140 }, { "epoch": 0.3354228527438689, "grad_norm": 4.440746888404653, "learning_rate": 7.48816692629508e-07, "loss": 1.3955, "step": 9150 }, { "epoch": 0.3357894350965944, "grad_norm": 3.1422454499623753, "learning_rate": 7.483161455363909e-07, "loss": 1.3613, "step": 9160 }, { "epoch": 0.33615601744932, "grad_norm": 3.894653506327936, "learning_rate": 7.478152678901455e-07, "loss": 1.4148, "step": 9170 }, { "epoch": 0.3365225998020455, "grad_norm": 5.433033949859381, "learning_rate": 7.473140603575294e-07, "loss": 1.3144, "step": 9180 }, { "epoch": 0.3368891821547711, "grad_norm": 3.975951714183405, "learning_rate": 7.468125236057392e-07, "loss": 1.3691, "step": 9190 }, { "epoch": 0.3372557645074966, "grad_norm": 4.918343199781564, "learning_rate": 7.463106583024099e-07, "loss": 1.3848, "step": 9200 }, { "epoch": 0.33762234686022213, "grad_norm": 4.865872631877682, "learning_rate": 7.458084651156138e-07, "loss": 1.3612, "step": 9210 }, { "epoch": 0.3379889292129477, "grad_norm": 4.124355883120795, "learning_rate": 7.453059447138597e-07, "loss": 1.3922, "step": 9220 }, { "epoch": 0.33835551156567323, "grad_norm": 3.4927433175723968, "learning_rate": 7.448030977660921e-07, "loss": 1.3209, "step": 9230 }, { "epoch": 0.33872209391839875, "grad_norm": 3.5565740075352887, "learning_rate": 7.4429992494169e-07, "loss": 1.3137, "step": 9240 }, { "epoch": 0.3390886762711243, "grad_norm": 3.2292820179583335, "learning_rate": 7.437964269104663e-07, "loss": 1.3469, "step": 9250 }, { "epoch": 0.33945525862384984, "grad_norm": 5.260253752526274, "learning_rate": 7.432926043426668e-07, "loss": 1.3067, "step": 9260 }, { "epoch": 0.33982184097657536, "grad_norm": 4.394976349303848, "learning_rate": 7.427884579089691e-07, "loss": 1.3423, "step": 9270 }, { "epoch": 0.34018842332930094, "grad_norm": 3.396422180187779, "learning_rate": 7.422839882804825e-07, "loss": 1.3449, "step": 9280 }, { "epoch": 0.34055500568202646, "grad_norm": 4.387777704799267, "learning_rate": 7.417791961287457e-07, "loss": 1.3274, "step": 9290 }, { "epoch": 0.34092158803475203, "grad_norm": 4.664699242153168, "learning_rate": 7.412740821257275e-07, "loss": 1.3147, "step": 9300 }, { "epoch": 0.34128817038747755, "grad_norm": 3.393736360787831, "learning_rate": 7.407686469438248e-07, "loss": 1.3934, "step": 9310 }, { "epoch": 0.34165475274020307, "grad_norm": 4.750927708757991, "learning_rate": 7.40262891255862e-07, "loss": 1.4067, "step": 9320 }, { "epoch": 0.34202133509292865, "grad_norm": 3.428169411059033, "learning_rate": 7.397568157350903e-07, "loss": 1.3411, "step": 9330 }, { "epoch": 0.34238791744565417, "grad_norm": 4.302469394811799, "learning_rate": 7.392504210551865e-07, "loss": 1.299, "step": 9340 }, { "epoch": 0.3427544997983797, "grad_norm": 7.00981557963566, "learning_rate": 7.387437078902523e-07, "loss": 1.3573, "step": 9350 }, { "epoch": 0.34312108215110526, "grad_norm": 5.566063359486336, "learning_rate": 7.382366769148136e-07, "loss": 1.3497, "step": 9360 }, { "epoch": 0.3434876645038308, "grad_norm": 3.4660448886166244, "learning_rate": 7.37729328803819e-07, "loss": 1.4092, "step": 9370 }, { "epoch": 0.3438542468565563, "grad_norm": 3.702869545438875, "learning_rate": 7.372216642326394e-07, "loss": 1.3603, "step": 9380 }, { "epoch": 0.3442208292092819, "grad_norm": 4.231146103126798, "learning_rate": 7.367136838770671e-07, "loss": 1.3428, "step": 9390 }, { "epoch": 0.3445874115620074, "grad_norm": 4.554271919619236, "learning_rate": 7.362053884133146e-07, "loss": 1.3311, "step": 9400 }, { "epoch": 0.34495399391473297, "grad_norm": 4.041325390537124, "learning_rate": 7.35696778518014e-07, "loss": 1.3471, "step": 9410 }, { "epoch": 0.3453205762674585, "grad_norm": 5.283681695413367, "learning_rate": 7.351878548682155e-07, "loss": 1.3334, "step": 9420 }, { "epoch": 0.345687158620184, "grad_norm": 4.104429136831335, "learning_rate": 7.34678618141388e-07, "loss": 1.3443, "step": 9430 }, { "epoch": 0.3460537409729096, "grad_norm": 4.637839526253117, "learning_rate": 7.341690690154161e-07, "loss": 1.3383, "step": 9440 }, { "epoch": 0.3464203233256351, "grad_norm": 6.447434633082354, "learning_rate": 7.336592081686007e-07, "loss": 1.3769, "step": 9450 }, { "epoch": 0.3467869056783606, "grad_norm": 4.989354934531907, "learning_rate": 7.331490362796579e-07, "loss": 1.3651, "step": 9460 }, { "epoch": 0.3471534880310862, "grad_norm": 4.121285832330203, "learning_rate": 7.326385540277171e-07, "loss": 1.319, "step": 9470 }, { "epoch": 0.3475200703838117, "grad_norm": 3.7909593948348284, "learning_rate": 7.321277620923217e-07, "loss": 1.3743, "step": 9480 }, { "epoch": 0.34788665273653724, "grad_norm": 3.3733089497346853, "learning_rate": 7.316166611534267e-07, "loss": 1.3743, "step": 9490 }, { "epoch": 0.3482532350892628, "grad_norm": 3.7253741770570823, "learning_rate": 7.311052518913989e-07, "loss": 1.2903, "step": 9500 }, { "epoch": 0.34861981744198833, "grad_norm": 4.039793671210928, "learning_rate": 7.305935349870155e-07, "loss": 1.2862, "step": 9510 }, { "epoch": 0.3489863997947139, "grad_norm": 4.342535349346429, "learning_rate": 7.300815111214628e-07, "loss": 1.3808, "step": 9520 }, { "epoch": 0.34935298214743943, "grad_norm": 5.42799281760455, "learning_rate": 7.29569180976336e-07, "loss": 1.3523, "step": 9530 }, { "epoch": 0.34971956450016495, "grad_norm": 5.020277916958928, "learning_rate": 7.290565452336381e-07, "loss": 1.3256, "step": 9540 }, { "epoch": 0.3500861468528905, "grad_norm": 4.373712918374428, "learning_rate": 7.285436045757789e-07, "loss": 1.2827, "step": 9550 }, { "epoch": 0.35045272920561604, "grad_norm": 6.179796353095443, "learning_rate": 7.280303596855737e-07, "loss": 1.3197, "step": 9560 }, { "epoch": 0.35081931155834156, "grad_norm": 5.167300912494304, "learning_rate": 7.275168112462433e-07, "loss": 1.331, "step": 9570 }, { "epoch": 0.35118589391106714, "grad_norm": 4.118700000532668, "learning_rate": 7.270029599414125e-07, "loss": 1.3529, "step": 9580 }, { "epoch": 0.35155247626379266, "grad_norm": 3.6038833094843516, "learning_rate": 7.264888064551089e-07, "loss": 1.3258, "step": 9590 }, { "epoch": 0.3519190586165182, "grad_norm": 3.5142758374979524, "learning_rate": 7.259743514717627e-07, "loss": 1.3377, "step": 9600 }, { "epoch": 0.35228564096924375, "grad_norm": 4.1250041287694685, "learning_rate": 7.254595956762053e-07, "loss": 1.3135, "step": 9610 }, { "epoch": 0.35265222332196927, "grad_norm": 3.132058137932181, "learning_rate": 7.249445397536686e-07, "loss": 1.3349, "step": 9620 }, { "epoch": 0.35301880567469485, "grad_norm": 3.399519224329254, "learning_rate": 7.244291843897839e-07, "loss": 1.3052, "step": 9630 }, { "epoch": 0.35338538802742037, "grad_norm": 4.712619284275666, "learning_rate": 7.239135302705816e-07, "loss": 1.3065, "step": 9640 }, { "epoch": 0.3537519703801459, "grad_norm": 3.734161433235809, "learning_rate": 7.23397578082489e-07, "loss": 1.3094, "step": 9650 }, { "epoch": 0.35411855273287146, "grad_norm": 5.100823292959423, "learning_rate": 7.228813285123308e-07, "loss": 1.3331, "step": 9660 }, { "epoch": 0.354485135085597, "grad_norm": 4.534677424827633, "learning_rate": 7.223647822473271e-07, "loss": 1.3912, "step": 9670 }, { "epoch": 0.3548517174383225, "grad_norm": 3.470979394380451, "learning_rate": 7.218479399750934e-07, "loss": 1.3476, "step": 9680 }, { "epoch": 0.3552182997910481, "grad_norm": 4.753775104454421, "learning_rate": 7.21330802383639e-07, "loss": 1.3167, "step": 9690 }, { "epoch": 0.3555848821437736, "grad_norm": 3.412263014571041, "learning_rate": 7.208133701613665e-07, "loss": 1.3358, "step": 9700 }, { "epoch": 0.3559514644964991, "grad_norm": 4.131601355517602, "learning_rate": 7.202956439970704e-07, "loss": 1.3244, "step": 9710 }, { "epoch": 0.3563180468492247, "grad_norm": 5.122163472630932, "learning_rate": 7.197776245799367e-07, "loss": 1.2796, "step": 9720 }, { "epoch": 0.3566846292019502, "grad_norm": 5.335391466451254, "learning_rate": 7.192593125995418e-07, "loss": 1.3161, "step": 9730 }, { "epoch": 0.3570512115546758, "grad_norm": 4.103339016303858, "learning_rate": 7.187407087458518e-07, "loss": 1.4146, "step": 9740 }, { "epoch": 0.3574177939074013, "grad_norm": 5.904708913785668, "learning_rate": 7.182218137092204e-07, "loss": 1.3092, "step": 9750 }, { "epoch": 0.3577843762601268, "grad_norm": 4.187532290173183, "learning_rate": 7.1770262818039e-07, "loss": 1.2946, "step": 9760 }, { "epoch": 0.3581509586128524, "grad_norm": 4.6467762537942, "learning_rate": 7.17183152850489e-07, "loss": 1.3212, "step": 9770 }, { "epoch": 0.3585175409655779, "grad_norm": 4.424491675585427, "learning_rate": 7.16663388411032e-07, "loss": 1.3167, "step": 9780 }, { "epoch": 0.35888412331830344, "grad_norm": 4.460602913760459, "learning_rate": 7.161433355539181e-07, "loss": 1.3514, "step": 9790 }, { "epoch": 0.359250705671029, "grad_norm": 7.380392542181771, "learning_rate": 7.156229949714307e-07, "loss": 1.305, "step": 9800 }, { "epoch": 0.35961728802375453, "grad_norm": 3.677155226574757, "learning_rate": 7.15102367356236e-07, "loss": 1.3175, "step": 9810 }, { "epoch": 0.35998387037648005, "grad_norm": 2.995203775176967, "learning_rate": 7.145814534013821e-07, "loss": 1.3833, "step": 9820 }, { "epoch": 0.36035045272920563, "grad_norm": 3.5086546677463364, "learning_rate": 7.140602538002989e-07, "loss": 1.3858, "step": 9830 }, { "epoch": 0.36071703508193115, "grad_norm": 3.523795917156669, "learning_rate": 7.135387692467957e-07, "loss": 1.3375, "step": 9840 }, { "epoch": 0.3610836174346567, "grad_norm": 3.7313877963514, "learning_rate": 7.130170004350617e-07, "loss": 1.3094, "step": 9850 }, { "epoch": 0.36145019978738224, "grad_norm": 4.442532041857861, "learning_rate": 7.124949480596644e-07, "loss": 1.3121, "step": 9860 }, { "epoch": 0.36181678214010776, "grad_norm": 5.641090705197642, "learning_rate": 7.119726128155487e-07, "loss": 1.3387, "step": 9870 }, { "epoch": 0.36218336449283334, "grad_norm": 9.369536303911914, "learning_rate": 7.114499953980362e-07, "loss": 1.3413, "step": 9880 }, { "epoch": 0.36254994684555886, "grad_norm": 4.32109030408511, "learning_rate": 7.109270965028238e-07, "loss": 1.3636, "step": 9890 }, { "epoch": 0.3629165291982844, "grad_norm": 6.871086039775216, "learning_rate": 7.104039168259834e-07, "loss": 1.352, "step": 9900 }, { "epoch": 0.36328311155100995, "grad_norm": 4.509944406939018, "learning_rate": 7.098804570639605e-07, "loss": 1.2874, "step": 9910 }, { "epoch": 0.36364969390373547, "grad_norm": 4.612863347134658, "learning_rate": 7.093567179135738e-07, "loss": 1.2676, "step": 9920 }, { "epoch": 0.364016276256461, "grad_norm": 4.091094769005595, "learning_rate": 7.088327000720131e-07, "loss": 1.3038, "step": 9930 }, { "epoch": 0.36438285860918657, "grad_norm": 4.977334963231582, "learning_rate": 7.083084042368401e-07, "loss": 1.3008, "step": 9940 }, { "epoch": 0.3647494409619121, "grad_norm": 5.166826475680081, "learning_rate": 7.077838311059862e-07, "loss": 1.2881, "step": 9950 }, { "epoch": 0.36511602331463766, "grad_norm": 4.01832965003142, "learning_rate": 7.072589813777518e-07, "loss": 1.3523, "step": 9960 }, { "epoch": 0.3654826056673632, "grad_norm": 3.8045628665321214, "learning_rate": 7.067338557508055e-07, "loss": 1.3155, "step": 9970 }, { "epoch": 0.3658491880200887, "grad_norm": 4.344284713227578, "learning_rate": 7.062084549241833e-07, "loss": 1.3314, "step": 9980 }, { "epoch": 0.3662157703728143, "grad_norm": 4.559382806632024, "learning_rate": 7.056827795972876e-07, "loss": 1.3242, "step": 9990 }, { "epoch": 0.3665823527255398, "grad_norm": 8.960735940046002, "learning_rate": 7.051568304698862e-07, "loss": 1.2563, "step": 10000 }, { "epoch": 0.3665823527255398, "eval_accuracy": 0.7009188125309459, "eval_loss": 1.3158118724822998, "eval_runtime": 311.2198, "eval_samples_per_second": 10.626, "eval_steps_per_second": 0.887, "step": 10000 } ], "logging_steps": 10, "max_steps": 27279, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1085213557587968.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }