{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.3665823527255398,
  "eval_steps": 1000,
  "global_step": 10000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0003665823527255398,
      "grad_norm": 0.8778485808644284,
      "learning_rate": 2e-07,
      "loss": 2.1465,
      "step": 10
    },
    {
      "epoch": 0.0007331647054510796,
      "grad_norm": 1.0370696683685088,
      "learning_rate": 4e-07,
      "loss": 2.1972,
      "step": 20
    },
    {
      "epoch": 0.0010997470581766194,
      "grad_norm": 1.0006676078231553,
      "learning_rate": 6e-07,
      "loss": 2.1582,
      "step": 30
    },
    {
      "epoch": 0.0014663294109021592,
      "grad_norm": 0.8854477289760336,
      "learning_rate": 8e-07,
      "loss": 2.1934,
      "step": 40
    },
    {
      "epoch": 0.001832911763627699,
      "grad_norm": 0.8999727006888211,
      "learning_rate": 1e-06,
      "loss": 2.1904,
      "step": 50
    },
    {
      "epoch": 0.002199494116353239,
      "grad_norm": 0.932364223152173,
      "learning_rate": 9.999996672053607e-07,
      "loss": 2.1706,
      "step": 60
    },
    {
      "epoch": 0.0025660764690787785,
      "grad_norm": 1.0299012086021375,
      "learning_rate": 9.999986688218858e-07,
      "loss": 2.1958,
      "step": 70
    },
    {
      "epoch": 0.0029326588218043185,
      "grad_norm": 0.9395158606106717,
      "learning_rate": 9.999970048509042e-07,
      "loss": 2.2273,
      "step": 80
    },
    {
      "epoch": 0.003299241174529858,
      "grad_norm": 0.9869960358591985,
      "learning_rate": 9.999946752946311e-07,
      "loss": 2.1807,
      "step": 90
    },
    {
      "epoch": 0.003665823527255398,
      "grad_norm": 0.98825421384792,
      "learning_rate": 9.999916801561675e-07,
      "loss": 2.1348,
      "step": 100
    },
    {
      "epoch": 0.004032405879980938,
      "grad_norm": 1.1988395000442367,
      "learning_rate": 9.999880194395004e-07,
      "loss": 2.1377,
      "step": 110
    },
    {
      "epoch": 0.004398988232706478,
      "grad_norm": 1.129064025809237,
      "learning_rate": 9.99983693149503e-07,
      "loss": 2.1565,
      "step": 120
    },
    {
      "epoch": 0.004765570585432017,
      "grad_norm": 1.0050118479797396,
      "learning_rate": 9.999787012919342e-07,
      "loss": 2.1701,
      "step": 130
    },
    {
      "epoch": 0.005132152938157557,
      "grad_norm": 0.9232759625522824,
      "learning_rate": 9.999730438734393e-07,
      "loss": 2.0963,
      "step": 140
    },
    {
      "epoch": 0.0054987352908830965,
      "grad_norm": 1.0348403490845175,
      "learning_rate": 9.999667209015492e-07,
      "loss": 2.1989,
      "step": 150
    },
    {
      "epoch": 0.005865317643608637,
      "grad_norm": 1.0493408122676058,
      "learning_rate": 9.999597323846806e-07,
      "loss": 2.1707,
      "step": 160
    },
    {
      "epoch": 0.0062318999963341766,
      "grad_norm": 1.116513730433909,
      "learning_rate": 9.99952078332137e-07,
      "loss": 2.1614,
      "step": 170
    },
    {
      "epoch": 0.006598482349059716,
      "grad_norm": 0.9558367370618089,
      "learning_rate": 9.999437587541072e-07,
      "loss": 2.1214,
      "step": 180
    },
    {
      "epoch": 0.006965064701785256,
      "grad_norm": 1.0990453159310916,
      "learning_rate": 9.999347736616657e-07,
      "loss": 2.1514,
      "step": 190
    },
    {
      "epoch": 0.007331647054510796,
      "grad_norm": 1.051146838955259,
      "learning_rate": 9.999251230667734e-07,
      "loss": 2.1672,
      "step": 200
    },
    {
      "epoch": 0.007698229407236336,
      "grad_norm": 1.0528334484392676,
      "learning_rate": 9.99914806982277e-07,
      "loss": 2.1651,
      "step": 210
    },
    {
      "epoch": 0.008064811759961876,
      "grad_norm": 1.0488001209067876,
      "learning_rate": 9.999038254219094e-07,
      "loss": 2.1269,
      "step": 220
    },
    {
      "epoch": 0.008431394112687415,
      "grad_norm": 1.0423933094923075,
      "learning_rate": 9.998921784002884e-07,
      "loss": 2.1409,
      "step": 230
    },
    {
      "epoch": 0.008797976465412955,
      "grad_norm": 1.2035163212207243,
      "learning_rate": 9.998798659329188e-07,
      "loss": 2.0949,
      "step": 240
    },
    {
      "epoch": 0.009164558818138494,
      "grad_norm": 1.0311622443925152,
      "learning_rate": 9.998668880361902e-07,
      "loss": 2.1572,
      "step": 250
    },
    {
      "epoch": 0.009531141170864035,
      "grad_norm": 1.0199238986570556,
      "learning_rate": 9.99853244727379e-07,
      "loss": 2.0908,
      "step": 260
    },
    {
      "epoch": 0.009897723523589575,
      "grad_norm": 1.1052910194491554,
      "learning_rate": 9.998389360246465e-07,
      "loss": 2.1046,
      "step": 270
    },
    {
      "epoch": 0.010264305876315114,
      "grad_norm": 1.0244380828171549,
      "learning_rate": 9.998239619470404e-07,
      "loss": 2.1351,
      "step": 280
    },
    {
      "epoch": 0.010630888229040654,
      "grad_norm": 1.0080176905815665,
      "learning_rate": 9.998083225144936e-07,
      "loss": 2.089,
      "step": 290
    },
    {
      "epoch": 0.010997470581766193,
      "grad_norm": 0.9588881775099163,
      "learning_rate": 9.997920177478252e-07,
      "loss": 2.0186,
      "step": 300
    },
    {
      "epoch": 0.011364052934491733,
      "grad_norm": 1.0223619251237732,
      "learning_rate": 9.997750476687394e-07,
      "loss": 2.0966,
      "step": 310
    },
    {
      "epoch": 0.011730635287217274,
      "grad_norm": 1.1940399230837102,
      "learning_rate": 9.99757412299827e-07,
      "loss": 2.1036,
      "step": 320
    },
    {
      "epoch": 0.012097217639942813,
      "grad_norm": 0.9943487033980454,
      "learning_rate": 9.997391116645635e-07,
      "loss": 2.0628,
      "step": 330
    },
    {
      "epoch": 0.012463799992668353,
      "grad_norm": 1.03891573450971,
      "learning_rate": 9.997201457873104e-07,
      "loss": 2.0691,
      "step": 340
    },
    {
      "epoch": 0.012830382345393894,
      "grad_norm": 1.116344520158988,
      "learning_rate": 9.997005146933144e-07,
      "loss": 2.0524,
      "step": 350
    },
    {
      "epoch": 0.013196964698119432,
      "grad_norm": 0.9966017657422209,
      "learning_rate": 9.996802184087082e-07,
      "loss": 2.0779,
      "step": 360
    },
    {
      "epoch": 0.013563547050844973,
      "grad_norm": 1.0412743923430994,
      "learning_rate": 9.996592569605099e-07,
      "loss": 2.0376,
      "step": 370
    },
    {
      "epoch": 0.013930129403570512,
      "grad_norm": 1.1118998023014073,
      "learning_rate": 9.996376303766227e-07,
      "loss": 2.015,
      "step": 380
    },
    {
      "epoch": 0.014296711756296052,
      "grad_norm": 1.0325566872435106,
      "learning_rate": 9.996153386858355e-07,
      "loss": 2.0249,
      "step": 390
    },
    {
      "epoch": 0.014663294109021592,
      "grad_norm": 0.9345504257678122,
      "learning_rate": 9.995923819178226e-07,
      "loss": 2.0451,
      "step": 400
    },
    {
      "epoch": 0.015029876461747131,
      "grad_norm": 0.8875269101106378,
      "learning_rate": 9.995687601031435e-07,
      "loss": 2.0108,
      "step": 410
    },
    {
      "epoch": 0.015396458814472672,
      "grad_norm": 1.0784341870798066,
      "learning_rate": 9.99544473273243e-07,
      "loss": 2.0201,
      "step": 420
    },
    {
      "epoch": 0.015763041167198212,
      "grad_norm": 0.9379135038421763,
      "learning_rate": 9.995195214604515e-07,
      "loss": 1.941,
      "step": 430
    },
    {
      "epoch": 0.016129623519923753,
      "grad_norm": 0.9126909079244707,
      "learning_rate": 9.994939046979838e-07,
      "loss": 1.9684,
      "step": 440
    },
    {
      "epoch": 0.01649620587264929,
      "grad_norm": 0.8838022442791796,
      "learning_rate": 9.994676230199407e-07,
      "loss": 2.0389,
      "step": 450
    },
    {
      "epoch": 0.01686278822537483,
      "grad_norm": 0.8836839199930503,
      "learning_rate": 9.994406764613082e-07,
      "loss": 1.9666,
      "step": 460
    },
    {
      "epoch": 0.01722937057810037,
      "grad_norm": 1.0627568898996331,
      "learning_rate": 9.994130650579563e-07,
      "loss": 2.0156,
      "step": 470
    },
    {
      "epoch": 0.01759595293082591,
      "grad_norm": 0.9141641845780258,
      "learning_rate": 9.993847888466408e-07,
      "loss": 1.9649,
      "step": 480
    },
    {
      "epoch": 0.01796253528355145,
      "grad_norm": 0.9929808622960486,
      "learning_rate": 9.993558478650027e-07,
      "loss": 1.951,
      "step": 490
    },
    {
      "epoch": 0.01832911763627699,
      "grad_norm": 0.9649106649125109,
      "learning_rate": 9.993262421515677e-07,
      "loss": 2.0194,
      "step": 500
    },
    {
      "epoch": 0.01869569998900253,
      "grad_norm": 0.9646184299435382,
      "learning_rate": 9.992959717457456e-07,
      "loss": 2.0054,
      "step": 510
    },
    {
      "epoch": 0.01906228234172807,
      "grad_norm": 0.9754107205971403,
      "learning_rate": 9.992650366878326e-07,
      "loss": 1.9614,
      "step": 520
    },
    {
      "epoch": 0.01942886469445361,
      "grad_norm": 0.825876663123403,
      "learning_rate": 9.99233437019008e-07,
      "loss": 2.0141,
      "step": 530
    },
    {
      "epoch": 0.01979544704717915,
      "grad_norm": 0.9898145517539251,
      "learning_rate": 9.992011727813372e-07,
      "loss": 1.9788,
      "step": 540
    },
    {
      "epoch": 0.020162029399904687,
      "grad_norm": 1.1244188599069105,
      "learning_rate": 9.991682440177694e-07,
      "loss": 1.9034,
      "step": 550
    },
    {
      "epoch": 0.020528611752630228,
      "grad_norm": 1.1497344942569774,
      "learning_rate": 9.991346507721387e-07,
      "loss": 1.9211,
      "step": 560
    },
    {
      "epoch": 0.020895194105355768,
      "grad_norm": 0.9021316458842555,
      "learning_rate": 9.991003930891637e-07,
      "loss": 1.9182,
      "step": 570
    },
    {
      "epoch": 0.02126177645808131,
      "grad_norm": 0.8307709564470201,
      "learning_rate": 9.990654710144475e-07,
      "loss": 1.9272,
      "step": 580
    },
    {
      "epoch": 0.02162835881080685,
      "grad_norm": 0.8745951617052735,
      "learning_rate": 9.990298845944777e-07,
      "loss": 1.9499,
      "step": 590
    },
    {
      "epoch": 0.021994941163532386,
      "grad_norm": 0.8243921045085457,
      "learning_rate": 9.98993633876626e-07,
      "loss": 1.9221,
      "step": 600
    },
    {
      "epoch": 0.022361523516257927,
      "grad_norm": 0.9285168979863858,
      "learning_rate": 9.989567189091486e-07,
      "loss": 1.8804,
      "step": 610
    },
    {
      "epoch": 0.022728105868983467,
      "grad_norm": 0.9675998606348684,
      "learning_rate": 9.98919139741186e-07,
      "loss": 1.9019,
      "step": 620
    },
    {
      "epoch": 0.023094688221709007,
      "grad_norm": 0.8852104273861887,
      "learning_rate": 9.988808964227629e-07,
      "loss": 1.8772,
      "step": 630
    },
    {
      "epoch": 0.023461270574434548,
      "grad_norm": 0.819719680853091,
      "learning_rate": 9.988419890047877e-07,
      "loss": 1.9171,
      "step": 640
    },
    {
      "epoch": 0.023827852927160085,
      "grad_norm": 0.93140794342887,
      "learning_rate": 9.988024175390533e-07,
      "loss": 1.8467,
      "step": 650
    },
    {
      "epoch": 0.024194435279885625,
      "grad_norm": 0.8360802933834758,
      "learning_rate": 9.987621820782363e-07,
      "loss": 1.9233,
      "step": 660
    },
    {
      "epoch": 0.024561017632611166,
      "grad_norm": 0.8157180427592693,
      "learning_rate": 9.987212826758975e-07,
      "loss": 1.9473,
      "step": 670
    },
    {
      "epoch": 0.024927599985336706,
      "grad_norm": 0.9793002573948607,
      "learning_rate": 9.98679719386481e-07,
      "loss": 1.8931,
      "step": 680
    },
    {
      "epoch": 0.025294182338062247,
      "grad_norm": 0.8445420197840301,
      "learning_rate": 9.986374922653154e-07,
      "loss": 1.8686,
      "step": 690
    },
    {
      "epoch": 0.025660764690787787,
      "grad_norm": 0.8584605142905422,
      "learning_rate": 9.985946013686119e-07,
      "loss": 1.8967,
      "step": 700
    },
    {
      "epoch": 0.026027347043513324,
      "grad_norm": 0.98656156834715,
      "learning_rate": 9.985510467534664e-07,
      "loss": 1.8635,
      "step": 710
    },
    {
      "epoch": 0.026393929396238865,
      "grad_norm": 0.9182458113746159,
      "learning_rate": 9.985068284778577e-07,
      "loss": 1.8693,
      "step": 720
    },
    {
      "epoch": 0.026760511748964405,
      "grad_norm": 0.8330989668660308,
      "learning_rate": 9.984619466006485e-07,
      "loss": 1.8613,
      "step": 730
    },
    {
      "epoch": 0.027127094101689946,
      "grad_norm": 0.8644736624360776,
      "learning_rate": 9.98416401181584e-07,
      "loss": 1.8628,
      "step": 740
    },
    {
      "epoch": 0.027493676454415486,
      "grad_norm": 0.987168924150431,
      "learning_rate": 9.98370192281294e-07,
      "loss": 1.8943,
      "step": 750
    },
    {
      "epoch": 0.027860258807141023,
      "grad_norm": 0.8720418625775509,
      "learning_rate": 9.983233199612903e-07,
      "loss": 1.9446,
      "step": 760
    },
    {
      "epoch": 0.028226841159866563,
      "grad_norm": 0.7953663245922279,
      "learning_rate": 9.982757842839687e-07,
      "loss": 1.9014,
      "step": 770
    },
    {
      "epoch": 0.028593423512592104,
      "grad_norm": 0.9296681817326182,
      "learning_rate": 9.98227585312607e-07,
      "loss": 1.8108,
      "step": 780
    },
    {
      "epoch": 0.028960005865317644,
      "grad_norm": 0.8062000633701384,
      "learning_rate": 9.981787231113675e-07,
      "loss": 1.8345,
      "step": 790
    },
    {
      "epoch": 0.029326588218043185,
      "grad_norm": 0.7938194156111642,
      "learning_rate": 9.981291977452939e-07,
      "loss": 1.8941,
      "step": 800
    },
    {
      "epoch": 0.029693170570768722,
      "grad_norm": 0.9291321405470028,
      "learning_rate": 9.980790092803135e-07,
      "loss": 1.8403,
      "step": 810
    },
    {
      "epoch": 0.030059752923494262,
      "grad_norm": 0.8275423223500764,
      "learning_rate": 9.980281577832363e-07,
      "loss": 1.8402,
      "step": 820
    },
    {
      "epoch": 0.030426335276219803,
      "grad_norm": 0.8980283349268403,
      "learning_rate": 9.979766433217545e-07,
      "loss": 1.8691,
      "step": 830
    },
    {
      "epoch": 0.030792917628945343,
      "grad_norm": 0.7768796883189981,
      "learning_rate": 9.979244659644429e-07,
      "loss": 1.888,
      "step": 840
    },
    {
      "epoch": 0.031159499981670884,
      "grad_norm": 0.818398169635764,
      "learning_rate": 9.978716257807593e-07,
      "loss": 1.8814,
      "step": 850
    },
    {
      "epoch": 0.031526082334396424,
      "grad_norm": 0.8442121417280394,
      "learning_rate": 9.97818122841043e-07,
      "loss": 1.8369,
      "step": 860
    },
    {
      "epoch": 0.031892664687121965,
      "grad_norm": 0.8176757534156489,
      "learning_rate": 9.977639572165162e-07,
      "loss": 1.8591,
      "step": 870
    },
    {
      "epoch": 0.032259247039847505,
      "grad_norm": 0.8029579269470367,
      "learning_rate": 9.97709128979283e-07,
      "loss": 1.8866,
      "step": 880
    },
    {
      "epoch": 0.03262582939257304,
      "grad_norm": 0.8812915944662771,
      "learning_rate": 9.976536382023294e-07,
      "loss": 1.8366,
      "step": 890
    },
    {
      "epoch": 0.03299241174529858,
      "grad_norm": 0.777876054228082,
      "learning_rate": 9.97597484959524e-07,
      "loss": 1.8322,
      "step": 900
    },
    {
      "epoch": 0.03335899409802412,
      "grad_norm": 0.9073927568433396,
      "learning_rate": 9.975406693256162e-07,
      "loss": 1.8238,
      "step": 910
    },
    {
      "epoch": 0.03372557645074966,
      "grad_norm": 1.154230547383887,
      "learning_rate": 9.974831913762382e-07,
      "loss": 1.8574,
      "step": 920
    },
    {
      "epoch": 0.0340921588034752,
      "grad_norm": 0.8196714978615802,
      "learning_rate": 9.974250511879031e-07,
      "loss": 1.8423,
      "step": 930
    },
    {
      "epoch": 0.03445874115620074,
      "grad_norm": 0.9288752746341313,
      "learning_rate": 9.97366248838006e-07,
      "loss": 1.8993,
      "step": 940
    },
    {
      "epoch": 0.03482532350892628,
      "grad_norm": 0.7950657259868453,
      "learning_rate": 9.973067844048235e-07,
      "loss": 1.8741,
      "step": 950
    },
    {
      "epoch": 0.03519190586165182,
      "grad_norm": 0.796086365915343,
      "learning_rate": 9.972466579675131e-07,
      "loss": 1.7832,
      "step": 960
    },
    {
      "epoch": 0.03555848821437736,
      "grad_norm": 0.9066172708399791,
      "learning_rate": 9.97185869606114e-07,
      "loss": 1.8462,
      "step": 970
    },
    {
      "epoch": 0.0359250705671029,
      "grad_norm": 1.038083569499433,
      "learning_rate": 9.971244194015463e-07,
      "loss": 1.858,
      "step": 980
    },
    {
      "epoch": 0.036291652919828436,
      "grad_norm": 0.9051533251684815,
      "learning_rate": 9.97062307435611e-07,
      "loss": 1.8387,
      "step": 990
    },
    {
      "epoch": 0.03665823527255398,
      "grad_norm": 0.8381523935993735,
      "learning_rate": 9.969995337909908e-07,
      "loss": 1.8361,
      "step": 1000
    },
    {
      "epoch": 0.03665823527255398,
      "eval_accuracy": 0.5988169778677517,
      "eval_loss": 1.8318405151367188,
      "eval_runtime": 308.5555,
      "eval_samples_per_second": 10.718,
      "eval_steps_per_second": 0.894,
      "step": 1000
    },
    {
      "epoch": 0.03702481762527952,
      "grad_norm": 0.8427628207388767,
      "learning_rate": 9.969360985512478e-07,
      "loss": 1.8265,
      "step": 1010
    },
    {
      "epoch": 0.03739139997800506,
      "grad_norm": 0.8552215254960128,
      "learning_rate": 9.968720018008264e-07,
      "loss": 1.858,
      "step": 1020
    },
    {
      "epoch": 0.0377579823307306,
      "grad_norm": 0.9770990446912831,
      "learning_rate": 9.968072436250502e-07,
      "loss": 1.8336,
      "step": 1030
    },
    {
      "epoch": 0.03812456468345614,
      "grad_norm": 0.8749109462328284,
      "learning_rate": 9.967418241101245e-07,
      "loss": 1.8659,
      "step": 1040
    },
    {
      "epoch": 0.03849114703618168,
      "grad_norm": 1.0370092544039358,
      "learning_rate": 9.966757433431338e-07,
      "loss": 1.7817,
      "step": 1050
    },
    {
      "epoch": 0.03885772938890722,
      "grad_norm": 0.9115228378829131,
      "learning_rate": 9.966090014120439e-07,
      "loss": 1.8024,
      "step": 1060
    },
    {
      "epoch": 0.03922431174163276,
      "grad_norm": 0.8868427346212977,
      "learning_rate": 9.965415984056998e-07,
      "loss": 1.8437,
      "step": 1070
    },
    {
      "epoch": 0.0395908940943583,
      "grad_norm": 0.9053364161480404,
      "learning_rate": 9.96473534413827e-07,
      "loss": 1.817,
      "step": 1080
    },
    {
      "epoch": 0.039957476447083834,
      "grad_norm": 0.9133195528454671,
      "learning_rate": 9.964048095270312e-07,
      "loss": 1.7877,
      "step": 1090
    },
    {
      "epoch": 0.040324058799809374,
      "grad_norm": 1.0646101033232054,
      "learning_rate": 9.963354238367971e-07,
      "loss": 1.784,
      "step": 1100
    },
    {
      "epoch": 0.040690641152534915,
      "grad_norm": 0.7708104862115812,
      "learning_rate": 9.962653774354897e-07,
      "loss": 1.8534,
      "step": 1110
    },
    {
      "epoch": 0.041057223505260455,
      "grad_norm": 0.8675790148592712,
      "learning_rate": 9.96194670416353e-07,
      "loss": 1.8549,
      "step": 1120
    },
    {
      "epoch": 0.041423805857985996,
      "grad_norm": 0.8417668918121122,
      "learning_rate": 9.961233028735107e-07,
      "loss": 1.816,
      "step": 1130
    },
    {
      "epoch": 0.041790388210711536,
      "grad_norm": 0.8168288703880237,
      "learning_rate": 9.960512749019661e-07,
      "loss": 1.8512,
      "step": 1140
    },
    {
      "epoch": 0.04215697056343708,
      "grad_norm": 0.8018545416660454,
      "learning_rate": 9.95978586597601e-07,
      "loss": 1.832,
      "step": 1150
    },
    {
      "epoch": 0.04252355291616262,
      "grad_norm": 0.9865966895727584,
      "learning_rate": 9.959052380571764e-07,
      "loss": 1.853,
      "step": 1160
    },
    {
      "epoch": 0.04289013526888816,
      "grad_norm": 0.8107907928839149,
      "learning_rate": 9.958312293783327e-07,
      "loss": 1.85,
      "step": 1170
    },
    {
      "epoch": 0.0432567176216137,
      "grad_norm": 0.9230676080344427,
      "learning_rate": 9.957565606595882e-07,
      "loss": 1.7839,
      "step": 1180
    },
    {
      "epoch": 0.04362329997433924,
      "grad_norm": 0.9011134249108275,
      "learning_rate": 9.956812320003407e-07,
      "loss": 1.7649,
      "step": 1190
    },
    {
      "epoch": 0.04398988232706477,
      "grad_norm": 0.8877055310067349,
      "learning_rate": 9.956052435008657e-07,
      "loss": 1.8358,
      "step": 1200
    },
    {
      "epoch": 0.04435646467979031,
      "grad_norm": 0.9441745533847735,
      "learning_rate": 9.955285952623177e-07,
      "loss": 1.8217,
      "step": 1210
    },
    {
      "epoch": 0.04472304703251585,
      "grad_norm": 0.9280531244485228,
      "learning_rate": 9.954512873867292e-07,
      "loss": 1.8273,
      "step": 1220
    },
    {
      "epoch": 0.04508962938524139,
      "grad_norm": 1.0733510489183336,
      "learning_rate": 9.95373319977011e-07,
      "loss": 1.8289,
      "step": 1230
    },
    {
      "epoch": 0.045456211737966934,
      "grad_norm": 0.9194393203848475,
      "learning_rate": 9.952946931369512e-07,
      "loss": 1.8134,
      "step": 1240
    },
    {
      "epoch": 0.045822794090692474,
      "grad_norm": 0.8924651164337065,
      "learning_rate": 9.952154069712164e-07,
      "loss": 1.8233,
      "step": 1250
    },
    {
      "epoch": 0.046189376443418015,
      "grad_norm": 0.9645620934573451,
      "learning_rate": 9.951354615853506e-07,
      "loss": 1.7951,
      "step": 1260
    },
    {
      "epoch": 0.046555958796143555,
      "grad_norm": 0.9514951845878826,
      "learning_rate": 9.950548570857755e-07,
      "loss": 1.8034,
      "step": 1270
    },
    {
      "epoch": 0.046922541148869096,
      "grad_norm": 1.0861848487934576,
      "learning_rate": 9.949735935797898e-07,
      "loss": 1.7845,
      "step": 1280
    },
    {
      "epoch": 0.047289123501594636,
      "grad_norm": 0.9444165617124335,
      "learning_rate": 9.948916711755702e-07,
      "loss": 1.8499,
      "step": 1290
    },
    {
      "epoch": 0.04765570585432017,
      "grad_norm": 0.9296489213610688,
      "learning_rate": 9.948090899821695e-07,
      "loss": 1.8362,
      "step": 1300
    },
    {
      "epoch": 0.04802228820704571,
      "grad_norm": 0.9031404187157595,
      "learning_rate": 9.947258501095183e-07,
      "loss": 1.7987,
      "step": 1310
    },
    {
      "epoch": 0.04838887055977125,
      "grad_norm": 0.9893576898507132,
      "learning_rate": 9.946419516684238e-07,
      "loss": 1.7901,
      "step": 1320
    },
    {
      "epoch": 0.04875545291249679,
      "grad_norm": 0.8312432281714202,
      "learning_rate": 9.945573947705696e-07,
      "loss": 1.7877,
      "step": 1330
    },
    {
      "epoch": 0.04912203526522233,
      "grad_norm": 0.9503234488792208,
      "learning_rate": 9.944721795285161e-07,
      "loss": 1.7814,
      "step": 1340
    },
    {
      "epoch": 0.04948861761794787,
      "grad_norm": 0.8138144516056374,
      "learning_rate": 9.943863060557e-07,
      "loss": 1.7973,
      "step": 1350
    },
    {
      "epoch": 0.04985519997067341,
      "grad_norm": 1.0236050868655204,
      "learning_rate": 9.942997744664346e-07,
      "loss": 1.766,
      "step": 1360
    },
    {
      "epoch": 0.05022178232339895,
      "grad_norm": 0.8876253030811799,
      "learning_rate": 9.942125848759084e-07,
      "loss": 1.8025,
      "step": 1370
    },
    {
      "epoch": 0.05058836467612449,
      "grad_norm": 0.9143837255426513,
      "learning_rate": 9.941247374001864e-07,
      "loss": 1.8256,
      "step": 1380
    },
    {
      "epoch": 0.050954947028850034,
      "grad_norm": 0.7919956208916636,
      "learning_rate": 9.940362321562095e-07,
      "loss": 1.7966,
      "step": 1390
    },
    {
      "epoch": 0.051321529381575574,
      "grad_norm": 0.9593927463945575,
      "learning_rate": 9.939470692617936e-07,
      "loss": 1.756,
      "step": 1400
    },
    {
      "epoch": 0.05168811173430111,
      "grad_norm": 1.0264148022637987,
      "learning_rate": 9.938572488356309e-07,
      "loss": 1.7938,
      "step": 1410
    },
    {
      "epoch": 0.05205469408702665,
      "grad_norm": 1.0694910008156386,
      "learning_rate": 9.937667709972882e-07,
      "loss": 1.7151,
      "step": 1420
    },
    {
      "epoch": 0.05242127643975219,
      "grad_norm": 1.106949179035861,
      "learning_rate": 9.936756358672075e-07,
      "loss": 1.7566,
      "step": 1430
    },
    {
      "epoch": 0.05278785879247773,
      "grad_norm": 0.8484995009187619,
      "learning_rate": 9.935838435667062e-07,
      "loss": 1.8061,
      "step": 1440
    },
    {
      "epoch": 0.05315444114520327,
      "grad_norm": 0.9442924790988804,
      "learning_rate": 9.93491394217976e-07,
      "loss": 1.7938,
      "step": 1450
    },
    {
      "epoch": 0.05352102349792881,
      "grad_norm": 0.8835040984395444,
      "learning_rate": 9.933982879440838e-07,
      "loss": 1.7801,
      "step": 1460
    },
    {
      "epoch": 0.05388760585065435,
      "grad_norm": 0.951681021528121,
      "learning_rate": 9.933045248689704e-07,
      "loss": 1.7839,
      "step": 1470
    },
    {
      "epoch": 0.05425418820337989,
      "grad_norm": 0.8986214443009446,
      "learning_rate": 9.932101051174513e-07,
      "loss": 1.8251,
      "step": 1480
    },
    {
      "epoch": 0.05462077055610543,
      "grad_norm": 0.8136477078651573,
      "learning_rate": 9.93115028815216e-07,
      "loss": 1.8429,
      "step": 1490
    },
    {
      "epoch": 0.05498735290883097,
      "grad_norm": 1.0031260237221131,
      "learning_rate": 9.93019296088828e-07,
      "loss": 1.7663,
      "step": 1500
    },
    {
      "epoch": 0.055353935261556506,
      "grad_norm": 0.9959012828848206,
      "learning_rate": 9.92922907065725e-07,
      "loss": 1.8269,
      "step": 1510
    },
    {
      "epoch": 0.055720517614282046,
      "grad_norm": 0.8915575658825868,
      "learning_rate": 9.928258618742176e-07,
      "loss": 1.7696,
      "step": 1520
    },
    {
      "epoch": 0.056087099967007586,
      "grad_norm": 0.9963782636445598,
      "learning_rate": 9.927281606434902e-07,
      "loss": 1.7738,
      "step": 1530
    },
    {
      "epoch": 0.05645368231973313,
      "grad_norm": 0.9381564546633785,
      "learning_rate": 9.92629803503601e-07,
      "loss": 1.7333,
      "step": 1540
    },
    {
      "epoch": 0.05682026467245867,
      "grad_norm": 1.0017202007335113,
      "learning_rate": 9.925307905854807e-07,
      "loss": 1.8095,
      "step": 1550
    },
    {
      "epoch": 0.05718684702518421,
      "grad_norm": 1.0543725728983615,
      "learning_rate": 9.924311220209332e-07,
      "loss": 1.7571,
      "step": 1560
    },
    {
      "epoch": 0.05755342937790975,
      "grad_norm": 1.0455383232236297,
      "learning_rate": 9.92330797942635e-07,
      "loss": 1.7605,
      "step": 1570
    },
    {
      "epoch": 0.05792001173063529,
      "grad_norm": 0.8416991518569622,
      "learning_rate": 9.922298184841356e-07,
      "loss": 1.7703,
      "step": 1580
    },
    {
      "epoch": 0.05828659408336083,
      "grad_norm": 0.92044213042727,
      "learning_rate": 9.921281837798565e-07,
      "loss": 1.7051,
      "step": 1590
    },
    {
      "epoch": 0.05865317643608637,
      "grad_norm": 0.9422384532621354,
      "learning_rate": 9.920258939650918e-07,
      "loss": 1.7882,
      "step": 1600
    },
    {
      "epoch": 0.0590197587888119,
      "grad_norm": 1.1464397608985724,
      "learning_rate": 9.919229491760074e-07,
      "loss": 1.7504,
      "step": 1610
    },
    {
      "epoch": 0.059386341141537444,
      "grad_norm": 1.1503410560007548,
      "learning_rate": 9.918193495496411e-07,
      "loss": 1.7755,
      "step": 1620
    },
    {
      "epoch": 0.059752923494262984,
      "grad_norm": 1.034854775422536,
      "learning_rate": 9.917150952239028e-07,
      "loss": 1.8109,
      "step": 1630
    },
    {
      "epoch": 0.060119505846988525,
      "grad_norm": 0.9357240877838402,
      "learning_rate": 9.916101863375734e-07,
      "loss": 1.812,
      "step": 1640
    },
    {
      "epoch": 0.060486088199714065,
      "grad_norm": 1.2613406348730127,
      "learning_rate": 9.915046230303055e-07,
      "loss": 1.7299,
      "step": 1650
    },
    {
      "epoch": 0.060852670552439606,
      "grad_norm": 0.991269818479319,
      "learning_rate": 9.913984054426226e-07,
      "loss": 1.6839,
      "step": 1660
    },
    {
      "epoch": 0.061219252905165146,
      "grad_norm": 1.0426302229265827,
      "learning_rate": 9.91291533715919e-07,
      "loss": 1.6983,
      "step": 1670
    },
    {
      "epoch": 0.061585835257890686,
      "grad_norm": 1.0623577818006307,
      "learning_rate": 9.911840079924607e-07,
      "loss": 1.7586,
      "step": 1680
    },
    {
      "epoch": 0.06195241761061623,
      "grad_norm": 0.9792793493189645,
      "learning_rate": 9.910758284153834e-07,
      "loss": 1.7863,
      "step": 1690
    },
    {
      "epoch": 0.06231899996334177,
      "grad_norm": 1.1013133546227525,
      "learning_rate": 9.90966995128693e-07,
      "loss": 1.7586,
      "step": 1700
    },
    {
      "epoch": 0.0626855823160673,
      "grad_norm": 1.2653001609685381,
      "learning_rate": 9.908575082772664e-07,
      "loss": 1.7087,
      "step": 1710
    },
    {
      "epoch": 0.06305216466879285,
      "grad_norm": 1.2600949114865185,
      "learning_rate": 9.907473680068501e-07,
      "loss": 1.6974,
      "step": 1720
    },
    {
      "epoch": 0.06341874702151838,
      "grad_norm": 1.0352843166386823,
      "learning_rate": 9.906365744640605e-07,
      "loss": 1.7247,
      "step": 1730
    },
    {
      "epoch": 0.06378532937424393,
      "grad_norm": 1.0534586823177523,
      "learning_rate": 9.905251277963838e-07,
      "loss": 1.7989,
      "step": 1740
    },
    {
      "epoch": 0.06415191172696946,
      "grad_norm": 1.0901888662447625,
      "learning_rate": 9.904130281521749e-07,
      "loss": 1.7495,
      "step": 1750
    },
    {
      "epoch": 0.06451849407969501,
      "grad_norm": 1.0657237836075932,
      "learning_rate": 9.903002756806589e-07,
      "loss": 1.7393,
      "step": 1760
    },
    {
      "epoch": 0.06488507643242054,
      "grad_norm": 1.0695629454280169,
      "learning_rate": 9.901868705319291e-07,
      "loss": 1.784,
      "step": 1770
    },
    {
      "epoch": 0.06525165878514608,
      "grad_norm": 0.9206279700392275,
      "learning_rate": 9.900728128569482e-07,
      "loss": 1.758,
      "step": 1780
    },
    {
      "epoch": 0.06561824113787162,
      "grad_norm": 1.0410164391482535,
      "learning_rate": 9.899581028075473e-07,
      "loss": 1.7252,
      "step": 1790
    },
    {
      "epoch": 0.06598482349059716,
      "grad_norm": 0.9377493357256449,
      "learning_rate": 9.898427405364262e-07,
      "loss": 1.74,
      "step": 1800
    },
    {
      "epoch": 0.0663514058433227,
      "grad_norm": 1.1272971880737597,
      "learning_rate": 9.897267261971524e-07,
      "loss": 1.7524,
      "step": 1810
    },
    {
      "epoch": 0.06671798819604824,
      "grad_norm": 1.0979559562270786,
      "learning_rate": 9.896100599441618e-07,
      "loss": 1.6988,
      "step": 1820
    },
    {
      "epoch": 0.06708457054877379,
      "grad_norm": 0.961855276743755,
      "learning_rate": 9.894927419327576e-07,
      "loss": 1.7327,
      "step": 1830
    },
    {
      "epoch": 0.06745115290149932,
      "grad_norm": 0.97235897562474,
      "learning_rate": 9.893747723191118e-07,
      "loss": 1.7544,
      "step": 1840
    },
    {
      "epoch": 0.06781773525422487,
      "grad_norm": 1.1764451813427488,
      "learning_rate": 9.892561512602626e-07,
      "loss": 1.7616,
      "step": 1850
    },
    {
      "epoch": 0.0681843176069504,
      "grad_norm": 0.9690232157285822,
      "learning_rate": 9.891368789141158e-07,
      "loss": 1.7386,
      "step": 1860
    },
    {
      "epoch": 0.06855089995967593,
      "grad_norm": 1.131145797735988,
      "learning_rate": 9.89016955439444e-07,
      "loss": 1.7473,
      "step": 1870
    },
    {
      "epoch": 0.06891748231240148,
      "grad_norm": 1.1996910697441496,
      "learning_rate": 9.88896380995887e-07,
      "loss": 1.7502,
      "step": 1880
    },
    {
      "epoch": 0.06928406466512702,
      "grad_norm": 1.2280647210603344,
      "learning_rate": 9.887751557439513e-07,
      "loss": 1.7547,
      "step": 1890
    },
    {
      "epoch": 0.06965064701785256,
      "grad_norm": 1.0705375351848956,
      "learning_rate": 9.886532798450085e-07,
      "loss": 1.7577,
      "step": 1900
    },
    {
      "epoch": 0.0700172293705781,
      "grad_norm": 1.0083918166967278,
      "learning_rate": 9.88530753461298e-07,
      "loss": 1.7193,
      "step": 1910
    },
    {
      "epoch": 0.07038381172330364,
      "grad_norm": 1.0053388433251793,
      "learning_rate": 9.884075767559236e-07,
      "loss": 1.7635,
      "step": 1920
    },
    {
      "epoch": 0.07075039407602918,
      "grad_norm": 1.1405257537860627,
      "learning_rate": 9.88283749892856e-07,
      "loss": 1.7859,
      "step": 1930
    },
    {
      "epoch": 0.07111697642875472,
      "grad_norm": 1.3872222978621402,
      "learning_rate": 9.881592730369305e-07,
      "loss": 1.6823,
      "step": 1940
    },
    {
      "epoch": 0.07148355878148026,
      "grad_norm": 1.0500974949147595,
      "learning_rate": 9.880341463538483e-07,
      "loss": 1.7268,
      "step": 1950
    },
    {
      "epoch": 0.0718501411342058,
      "grad_norm": 1.1146107157958263,
      "learning_rate": 9.879083700101754e-07,
      "loss": 1.7324,
      "step": 1960
    },
    {
      "epoch": 0.07221672348693134,
      "grad_norm": 1.0782444093138666,
      "learning_rate": 9.877819441733421e-07,
      "loss": 1.7219,
      "step": 1970
    },
    {
      "epoch": 0.07258330583965687,
      "grad_norm": 1.1066515564824118,
      "learning_rate": 9.876548690116443e-07,
      "loss": 1.6974,
      "step": 1980
    },
    {
      "epoch": 0.07294988819238242,
      "grad_norm": 1.0551270004207765,
      "learning_rate": 9.875271446942416e-07,
      "loss": 1.7086,
      "step": 1990
    },
    {
      "epoch": 0.07331647054510795,
      "grad_norm": 1.0172022580059552,
      "learning_rate": 9.873987713911579e-07,
      "loss": 1.7281,
      "step": 2000
    },
    {
      "epoch": 0.07331647054510795,
      "eval_accuracy": 0.6153943652920695,
      "eval_loss": 1.7325148582458496,
      "eval_runtime": 307.9034,
      "eval_samples_per_second": 10.74,
      "eval_steps_per_second": 0.896,
      "step": 2000
    },
    {
      "epoch": 0.0736830528978335,
      "grad_norm": 1.0319650415221862,
      "learning_rate": 9.872697492732805e-07,
      "loss": 1.699,
      "step": 2010
    },
    {
      "epoch": 0.07404963525055903,
      "grad_norm": 0.9982774529316707,
      "learning_rate": 9.871400785123615e-07,
      "loss": 1.7476,
      "step": 2020
    },
    {
      "epoch": 0.07441621760328458,
      "grad_norm": 1.1272779709424325,
      "learning_rate": 9.870097592810156e-07,
      "loss": 1.7911,
      "step": 2030
    },
    {
      "epoch": 0.07478279995601012,
      "grad_norm": 1.0356947186293473,
      "learning_rate": 9.86878791752721e-07,
      "loss": 1.7038,
      "step": 2040
    },
    {
      "epoch": 0.07514938230873566,
      "grad_norm": 0.9227271241300935,
      "learning_rate": 9.867471761018187e-07,
      "loss": 1.789,
      "step": 2050
    },
    {
      "epoch": 0.0755159646614612,
      "grad_norm": 1.1484518524699514,
      "learning_rate": 9.86614912503513e-07,
      "loss": 1.7706,
      "step": 2060
    },
    {
      "epoch": 0.07588254701418674,
      "grad_norm": 0.8955923870076745,
      "learning_rate": 9.864820011338698e-07,
      "loss": 1.7543,
      "step": 2070
    },
    {
      "epoch": 0.07624912936691228,
      "grad_norm": 1.1335067807492596,
      "learning_rate": 9.863484421698182e-07,
      "loss": 1.7155,
      "step": 2080
    },
    {
      "epoch": 0.07661571171963781,
      "grad_norm": 1.1784649675887455,
      "learning_rate": 9.86214235789149e-07,
      "loss": 1.7198,
      "step": 2090
    },
    {
      "epoch": 0.07698229407236336,
      "grad_norm": 0.9990776315852751,
      "learning_rate": 9.860793821705153e-07,
      "loss": 1.7088,
      "step": 2100
    },
    {
      "epoch": 0.07734887642508889,
      "grad_norm": 1.8933737366748618,
      "learning_rate": 9.859438814934306e-07,
      "loss": 1.7815,
      "step": 2110
    },
    {
      "epoch": 0.07771545877781444,
      "grad_norm": 1.0824373033670114,
      "learning_rate": 9.858077339382708e-07,
      "loss": 1.7056,
      "step": 2120
    },
    {
      "epoch": 0.07808204113053997,
      "grad_norm": 1.0459040499217758,
      "learning_rate": 9.856709396862727e-07,
      "loss": 1.7587,
      "step": 2130
    },
    {
      "epoch": 0.07844862348326552,
      "grad_norm": 1.1273027866420589,
      "learning_rate": 9.855334989195338e-07,
      "loss": 1.6718,
      "step": 2140
    },
    {
      "epoch": 0.07881520583599105,
      "grad_norm": 1.1216307142085522,
      "learning_rate": 9.853954118210124e-07,
      "loss": 1.6925,
      "step": 2150
    },
    {
      "epoch": 0.0791817881887166,
      "grad_norm": 1.2320479842440668,
      "learning_rate": 9.852566785745269e-07,
      "loss": 1.7128,
      "step": 2160
    },
    {
      "epoch": 0.07954837054144213,
      "grad_norm": 1.0679388999130817,
      "learning_rate": 9.851172993647562e-07,
      "loss": 1.7063,
      "step": 2170
    },
    {
      "epoch": 0.07991495289416767,
      "grad_norm": 1.2733808120999472,
      "learning_rate": 9.849772743772387e-07,
      "loss": 1.69,
      "step": 2180
    },
    {
      "epoch": 0.08028153524689322,
      "grad_norm": 1.240045987921097,
      "learning_rate": 9.848366037983728e-07,
      "loss": 1.7382,
      "step": 2190
    },
    {
      "epoch": 0.08064811759961875,
      "grad_norm": 1.0370629833579919,
      "learning_rate": 9.846952878154162e-07,
      "loss": 1.7135,
      "step": 2200
    },
    {
      "epoch": 0.0810146999523443,
      "grad_norm": 1.1809158590474762,
      "learning_rate": 9.845533266164856e-07,
      "loss": 1.7197,
      "step": 2210
    },
    {
      "epoch": 0.08138128230506983,
      "grad_norm": 1.0143562772242192,
      "learning_rate": 9.844107203905567e-07,
      "loss": 1.7062,
      "step": 2220
    },
    {
      "epoch": 0.08174786465779538,
      "grad_norm": 1.1841441026483928,
      "learning_rate": 9.842674693274639e-07,
      "loss": 1.6766,
      "step": 2230
    },
    {
      "epoch": 0.08211444701052091,
      "grad_norm": 1.1281564379658906,
      "learning_rate": 9.841235736179e-07,
      "loss": 1.6485,
      "step": 2240
    },
    {
      "epoch": 0.08248102936324646,
      "grad_norm": 1.2660731034162191,
      "learning_rate": 9.83979033453416e-07,
      "loss": 1.7513,
      "step": 2250
    },
    {
      "epoch": 0.08284761171597199,
      "grad_norm": 1.1670722746985231,
      "learning_rate": 9.8383384902642e-07,
      "loss": 1.7282,
      "step": 2260
    },
    {
      "epoch": 0.08321419406869754,
      "grad_norm": 1.1924698170354644,
      "learning_rate": 9.836880205301795e-07,
      "loss": 1.7339,
      "step": 2270
    },
    {
      "epoch": 0.08358077642142307,
      "grad_norm": 1.0522491790203259,
      "learning_rate": 9.835415481588173e-07,
      "loss": 1.6907,
      "step": 2280
    },
    {
      "epoch": 0.0839473587741486,
      "grad_norm": 1.1650865835189006,
      "learning_rate": 9.83394432107315e-07,
      "loss": 1.718,
      "step": 2290
    },
    {
      "epoch": 0.08431394112687415,
      "grad_norm": 0.9881537861019963,
      "learning_rate": 9.832466725715097e-07,
      "loss": 1.7423,
      "step": 2300
    },
    {
      "epoch": 0.08468052347959969,
      "grad_norm": 1.0843420992658444,
      "learning_rate": 9.830982697480958e-07,
      "loss": 1.7112,
      "step": 2310
    },
    {
      "epoch": 0.08504710583232523,
      "grad_norm": 1.1947303847486304,
      "learning_rate": 9.829492238346244e-07,
      "loss": 1.6813,
      "step": 2320
    },
    {
      "epoch": 0.08541368818505077,
      "grad_norm": 1.04336555772043,
      "learning_rate": 9.82799535029502e-07,
      "loss": 1.6871,
      "step": 2330
    },
    {
      "epoch": 0.08578027053777632,
      "grad_norm": 1.3465243494238373,
      "learning_rate": 9.826492035319911e-07,
      "loss": 1.7358,
      "step": 2340
    },
    {
      "epoch": 0.08614685289050185,
      "grad_norm": 1.1173189734449491,
      "learning_rate": 9.824982295422097e-07,
      "loss": 1.7047,
      "step": 2350
    },
    {
      "epoch": 0.0865134352432274,
      "grad_norm": 1.2520018391632697,
      "learning_rate": 9.823466132611313e-07,
      "loss": 1.6984,
      "step": 2360
    },
    {
      "epoch": 0.08688001759595293,
      "grad_norm": 1.03470369404529,
      "learning_rate": 9.82194354890584e-07,
      "loss": 1.7278,
      "step": 2370
    },
    {
      "epoch": 0.08724659994867848,
      "grad_norm": 1.0164204083388344,
      "learning_rate": 9.820414546332513e-07,
      "loss": 1.7458,
      "step": 2380
    },
    {
      "epoch": 0.08761318230140401,
      "grad_norm": 1.2348821126024987,
      "learning_rate": 9.818879126926701e-07,
      "loss": 1.7343,
      "step": 2390
    },
    {
      "epoch": 0.08797976465412954,
      "grad_norm": 1.0011105767660962,
      "learning_rate": 9.817337292732328e-07,
      "loss": 1.7131,
      "step": 2400
    },
    {
      "epoch": 0.08834634700685509,
      "grad_norm": 1.0710762717577924,
      "learning_rate": 9.815789045801847e-07,
      "loss": 1.6617,
      "step": 2410
    },
    {
      "epoch": 0.08871292935958063,
      "grad_norm": 1.1055970569118785,
      "learning_rate": 9.814234388196252e-07,
      "loss": 1.758,
      "step": 2420
    },
    {
      "epoch": 0.08907951171230617,
      "grad_norm": 1.013594052614807,
      "learning_rate": 9.81267332198507e-07,
      "loss": 1.6906,
      "step": 2430
    },
    {
      "epoch": 0.0894460940650317,
      "grad_norm": 1.0649424099545044,
      "learning_rate": 9.811105849246359e-07,
      "loss": 1.6896,
      "step": 2440
    },
    {
      "epoch": 0.08981267641775725,
      "grad_norm": 1.7084885584877294,
      "learning_rate": 9.809531972066705e-07,
      "loss": 1.6614,
      "step": 2450
    },
    {
      "epoch": 0.09017925877048279,
      "grad_norm": 1.5758236147361129,
      "learning_rate": 9.807951692541217e-07,
      "loss": 1.6952,
      "step": 2460
    },
    {
      "epoch": 0.09054584112320833,
      "grad_norm": 1.3585874981966901,
      "learning_rate": 9.806365012773532e-07,
      "loss": 1.7113,
      "step": 2470
    },
    {
      "epoch": 0.09091242347593387,
      "grad_norm": 1.3061869321513975,
      "learning_rate": 9.804771934875807e-07,
      "loss": 1.6796,
      "step": 2480
    },
    {
      "epoch": 0.09127900582865942,
      "grad_norm": 1.1540286110201206,
      "learning_rate": 9.803172460968705e-07,
      "loss": 1.7097,
      "step": 2490
    },
    {
      "epoch": 0.09164558818138495,
      "grad_norm": 1.2915686809771951,
      "learning_rate": 9.80156659318142e-07,
      "loss": 1.7138,
      "step": 2500
    },
    {
      "epoch": 0.09201217053411048,
      "grad_norm": 1.1468908768097306,
      "learning_rate": 9.799954333651642e-07,
      "loss": 1.7038,
      "step": 2510
    },
    {
      "epoch": 0.09237875288683603,
      "grad_norm": 1.257655656482852,
      "learning_rate": 9.79833568452558e-07,
      "loss": 1.677,
      "step": 2520
    },
    {
      "epoch": 0.09274533523956156,
      "grad_norm": 1.6361492549326027,
      "learning_rate": 9.796710647957944e-07,
      "loss": 1.6155,
      "step": 2530
    },
    {
      "epoch": 0.09311191759228711,
      "grad_norm": 1.1505717408841072,
      "learning_rate": 9.795079226111949e-07,
      "loss": 1.6811,
      "step": 2540
    },
    {
      "epoch": 0.09347849994501264,
      "grad_norm": 1.1983166183129195,
      "learning_rate": 9.793441421159308e-07,
      "loss": 1.7203,
      "step": 2550
    },
    {
      "epoch": 0.09384508229773819,
      "grad_norm": 1.1985818933727272,
      "learning_rate": 9.79179723528023e-07,
      "loss": 1.7232,
      "step": 2560
    },
    {
      "epoch": 0.09421166465046373,
      "grad_norm": 1.0143700528752713,
      "learning_rate": 9.790146670663422e-07,
      "loss": 1.6916,
      "step": 2570
    },
    {
      "epoch": 0.09457824700318927,
      "grad_norm": 1.121117592417204,
      "learning_rate": 9.788489729506082e-07,
      "loss": 1.6683,
      "step": 2580
    },
    {
      "epoch": 0.0949448293559148,
      "grad_norm": 1.339002521581536,
      "learning_rate": 9.78682641401389e-07,
      "loss": 1.6622,
      "step": 2590
    },
    {
      "epoch": 0.09531141170864034,
      "grad_norm": 1.1212646774920143,
      "learning_rate": 9.785156726401019e-07,
      "loss": 1.687,
      "step": 2600
    },
    {
      "epoch": 0.09567799406136589,
      "grad_norm": 1.2061879994547406,
      "learning_rate": 9.78348066889012e-07,
      "loss": 1.6652,
      "step": 2610
    },
    {
      "epoch": 0.09604457641409142,
      "grad_norm": 1.225185884537581,
      "learning_rate": 9.781798243712326e-07,
      "loss": 1.6948,
      "step": 2620
    },
    {
      "epoch": 0.09641115876681697,
      "grad_norm": 1.0146497215382635,
      "learning_rate": 9.780109453107245e-07,
      "loss": 1.7009,
      "step": 2630
    },
    {
      "epoch": 0.0967777411195425,
      "grad_norm": 1.2171300466801498,
      "learning_rate": 9.77841429932296e-07,
      "loss": 1.7087,
      "step": 2640
    },
    {
      "epoch": 0.09714432347226805,
      "grad_norm": 1.0629828650910798,
      "learning_rate": 9.77671278461602e-07,
      "loss": 1.7316,
      "step": 2650
    },
    {
      "epoch": 0.09751090582499358,
      "grad_norm": 1.1754432625786018,
      "learning_rate": 9.775004911251448e-07,
      "loss": 1.6953,
      "step": 2660
    },
    {
      "epoch": 0.09787748817771913,
      "grad_norm": 1.3069724342535498,
      "learning_rate": 9.773290681502727e-07,
      "loss": 1.7057,
      "step": 2670
    },
    {
      "epoch": 0.09824407053044466,
      "grad_norm": 1.3314679455466842,
      "learning_rate": 9.7715700976518e-07,
      "loss": 1.6842,
      "step": 2680
    },
    {
      "epoch": 0.09861065288317021,
      "grad_norm": 1.3928937247531508,
      "learning_rate": 9.769843161989079e-07,
      "loss": 1.7052,
      "step": 2690
    },
    {
      "epoch": 0.09897723523589574,
      "grad_norm": 1.3389115391442472,
      "learning_rate": 9.768109876813417e-07,
      "loss": 1.6905,
      "step": 2700
    },
    {
      "epoch": 0.09934381758862128,
      "grad_norm": 1.2854315608533564,
      "learning_rate": 9.76637024443213e-07,
      "loss": 1.6806,
      "step": 2710
    },
    {
      "epoch": 0.09971039994134683,
      "grad_norm": 1.24293956575573,
      "learning_rate": 9.764624267160975e-07,
      "loss": 1.6922,
      "step": 2720
    },
    {
      "epoch": 0.10007698229407236,
      "grad_norm": 1.2809307536658918,
      "learning_rate": 9.762871947324165e-07,
      "loss": 1.7001,
      "step": 2730
    },
    {
      "epoch": 0.1004435646467979,
      "grad_norm": 1.1615070632030087,
      "learning_rate": 9.761113287254345e-07,
      "loss": 1.6747,
      "step": 2740
    },
    {
      "epoch": 0.10081014699952344,
      "grad_norm": 1.245140216818738,
      "learning_rate": 9.75934828929261e-07,
      "loss": 1.6469,
      "step": 2750
    },
    {
      "epoch": 0.10117672935224899,
      "grad_norm": 1.152316966014997,
      "learning_rate": 9.757576955788486e-07,
      "loss": 1.6773,
      "step": 2760
    },
    {
      "epoch": 0.10154331170497452,
      "grad_norm": 1.1064605629765938,
      "learning_rate": 9.755799289099932e-07,
      "loss": 1.6447,
      "step": 2770
    },
    {
      "epoch": 0.10190989405770007,
      "grad_norm": 1.1150499110452152,
      "learning_rate": 9.754015291593343e-07,
      "loss": 1.7168,
      "step": 2780
    },
    {
      "epoch": 0.1022764764104256,
      "grad_norm": 1.3016769905995789,
      "learning_rate": 9.752224965643536e-07,
      "loss": 1.7209,
      "step": 2790
    },
    {
      "epoch": 0.10264305876315115,
      "grad_norm": 1.332321427009131,
      "learning_rate": 9.750428313633757e-07,
      "loss": 1.6247,
      "step": 2800
    },
    {
      "epoch": 0.10300964111587668,
      "grad_norm": 1.311092146207188,
      "learning_rate": 9.748625337955667e-07,
      "loss": 1.6366,
      "step": 2810
    },
    {
      "epoch": 0.10337622346860222,
      "grad_norm": 1.1634742047900515,
      "learning_rate": 9.746816041009351e-07,
      "loss": 1.7143,
      "step": 2820
    },
    {
      "epoch": 0.10374280582132776,
      "grad_norm": 1.1916284602740692,
      "learning_rate": 9.745000425203307e-07,
      "loss": 1.6568,
      "step": 2830
    },
    {
      "epoch": 0.1041093881740533,
      "grad_norm": 1.280571751055567,
      "learning_rate": 9.743178492954442e-07,
      "loss": 1.6303,
      "step": 2840
    },
    {
      "epoch": 0.10447597052677884,
      "grad_norm": 1.3621017517970784,
      "learning_rate": 9.741350246688076e-07,
      "loss": 1.7569,
      "step": 2850
    },
    {
      "epoch": 0.10484255287950438,
      "grad_norm": 1.1019913075705825,
      "learning_rate": 9.739515688837927e-07,
      "loss": 1.6934,
      "step": 2860
    },
    {
      "epoch": 0.10520913523222993,
      "grad_norm": 1.3868159647800968,
      "learning_rate": 9.73767482184612e-07,
      "loss": 1.6267,
      "step": 2870
    },
    {
      "epoch": 0.10557571758495546,
      "grad_norm": 1.4881189853618986,
      "learning_rate": 9.73582764816318e-07,
      "loss": 1.7354,
      "step": 2880
    },
    {
      "epoch": 0.105942299937681,
      "grad_norm": 1.5118948532986631,
      "learning_rate": 9.733974170248025e-07,
      "loss": 1.6856,
      "step": 2890
    },
    {
      "epoch": 0.10630888229040654,
      "grad_norm": 1.4796154680218983,
      "learning_rate": 9.732114390567963e-07,
      "loss": 1.7045,
      "step": 2900
    },
    {
      "epoch": 0.10667546464313207,
      "grad_norm": 1.2560441338500297,
      "learning_rate": 9.730248311598694e-07,
      "loss": 1.6466,
      "step": 2910
    },
    {
      "epoch": 0.10704204699585762,
      "grad_norm": 1.1595828068992133,
      "learning_rate": 9.728375935824301e-07,
      "loss": 1.6822,
      "step": 2920
    },
    {
      "epoch": 0.10740862934858315,
      "grad_norm": 1.3126146065763922,
      "learning_rate": 9.726497265737252e-07,
      "loss": 1.6723,
      "step": 2930
    },
    {
      "epoch": 0.1077752117013087,
      "grad_norm": 1.2296488317137073,
      "learning_rate": 9.724612303838393e-07,
      "loss": 1.6647,
      "step": 2940
    },
    {
      "epoch": 0.10814179405403423,
      "grad_norm": 1.170972623285309,
      "learning_rate": 9.722721052636944e-07,
      "loss": 1.6955,
      "step": 2950
    },
    {
      "epoch": 0.10850837640675978,
      "grad_norm": 1.2633141406462256,
      "learning_rate": 9.720823514650495e-07,
      "loss": 1.6332,
      "step": 2960
    },
    {
      "epoch": 0.10887495875948532,
      "grad_norm": 1.2911934178837097,
      "learning_rate": 9.718919692405014e-07,
      "loss": 1.7218,
      "step": 2970
    },
    {
      "epoch": 0.10924154111221086,
      "grad_norm": 1.1657180939495957,
      "learning_rate": 9.717009588434822e-07,
      "loss": 1.6067,
      "step": 2980
    },
    {
      "epoch": 0.1096081234649364,
      "grad_norm": 1.239214562886889,
      "learning_rate": 9.715093205282615e-07,
      "loss": 1.7067,
      "step": 2990
    },
    {
      "epoch": 0.10997470581766194,
      "grad_norm": 1.3619661984646028,
      "learning_rate": 9.713170545499435e-07,
      "loss": 1.6978,
      "step": 3000
    },
    {
      "epoch": 0.10997470581766194,
      "eval_accuracy": 0.6262376782115725,
      "eval_loss": 1.6762739419937134,
      "eval_runtime": 309.1255,
      "eval_samples_per_second": 10.698,
      "eval_steps_per_second": 0.893,
      "step": 3000
    },
    {
      "epoch": 0.11034128817038748,
      "grad_norm": 1.2670499181513593,
      "learning_rate": 9.711241611644688e-07,
      "loss": 1.677,
      "step": 3010
    },
    {
      "epoch": 0.11070787052311301,
      "grad_norm": 1.2403940254412753,
      "learning_rate": 9.709306406286129e-07,
      "loss": 1.6604,
      "step": 3020
    },
    {
      "epoch": 0.11107445287583856,
      "grad_norm": 1.3312898520587448,
      "learning_rate": 9.707364931999864e-07,
      "loss": 1.6867,
      "step": 3030
    },
    {
      "epoch": 0.11144103522856409,
      "grad_norm": 1.3495930407749666,
      "learning_rate": 9.70541719137034e-07,
      "loss": 1.6617,
      "step": 3040
    },
    {
      "epoch": 0.11180761758128964,
      "grad_norm": 1.1396532709110236,
      "learning_rate": 9.703463186990346e-07,
      "loss": 1.7035,
      "step": 3050
    },
    {
      "epoch": 0.11217419993401517,
      "grad_norm": 1.2231802562577823,
      "learning_rate": 9.701502921461013e-07,
      "loss": 1.6723,
      "step": 3060
    },
    {
      "epoch": 0.11254078228674072,
      "grad_norm": 1.3403523967021675,
      "learning_rate": 9.699536397391806e-07,
      "loss": 1.6698,
      "step": 3070
    },
    {
      "epoch": 0.11290736463946625,
      "grad_norm": 1.3447918453958256,
      "learning_rate": 9.697563617400516e-07,
      "loss": 1.6716,
      "step": 3080
    },
    {
      "epoch": 0.1132739469921918,
      "grad_norm": 1.2969348535087712,
      "learning_rate": 9.695584584113267e-07,
      "loss": 1.6949,
      "step": 3090
    },
    {
      "epoch": 0.11364052934491733,
      "grad_norm": 1.1643584556065927,
      "learning_rate": 9.693599300164508e-07,
      "loss": 1.6713,
      "step": 3100
    },
    {
      "epoch": 0.11400711169764288,
      "grad_norm": 1.2242377804664155,
      "learning_rate": 9.691607768197002e-07,
      "loss": 1.6386,
      "step": 3110
    },
    {
      "epoch": 0.11437369405036842,
      "grad_norm": 1.319822492671326,
      "learning_rate": 9.689609990861837e-07,
      "loss": 1.6816,
      "step": 3120
    },
    {
      "epoch": 0.11474027640309395,
      "grad_norm": 1.3781452196212938,
      "learning_rate": 9.687605970818408e-07,
      "loss": 1.6784,
      "step": 3130
    },
    {
      "epoch": 0.1151068587558195,
      "grad_norm": 1.2168088100404522,
      "learning_rate": 9.68559571073443e-07,
      "loss": 1.6982,
      "step": 3140
    },
    {
      "epoch": 0.11547344110854503,
      "grad_norm": 1.4540401524570652,
      "learning_rate": 9.68357921328591e-07,
      "loss": 1.6718,
      "step": 3150
    },
    {
      "epoch": 0.11584002346127058,
      "grad_norm": 1.3143498063269197,
      "learning_rate": 9.681556481157171e-07,
      "loss": 1.6709,
      "step": 3160
    },
    {
      "epoch": 0.11620660581399611,
      "grad_norm": 1.1946622719420839,
      "learning_rate": 9.679527517040831e-07,
      "loss": 1.6747,
      "step": 3170
    },
    {
      "epoch": 0.11657318816672166,
      "grad_norm": 1.286257203814063,
      "learning_rate": 9.6774923236378e-07,
      "loss": 1.699,
      "step": 3180
    },
    {
      "epoch": 0.11693977051944719,
      "grad_norm": 1.3969179686751765,
      "learning_rate": 9.675450903657286e-07,
      "loss": 1.6228,
      "step": 3190
    },
    {
      "epoch": 0.11730635287217274,
      "grad_norm": 1.1607892230894732,
      "learning_rate": 9.673403259816787e-07,
      "loss": 1.6538,
      "step": 3200
    },
    {
      "epoch": 0.11767293522489827,
      "grad_norm": 1.4009629932701972,
      "learning_rate": 9.671349394842075e-07,
      "loss": 1.6401,
      "step": 3210
    },
    {
      "epoch": 0.1180395175776238,
      "grad_norm": 1.5024706182569632,
      "learning_rate": 9.669289311467216e-07,
      "loss": 1.6508,
      "step": 3220
    },
    {
      "epoch": 0.11840609993034935,
      "grad_norm": 1.9466998313668968,
      "learning_rate": 9.66722301243455e-07,
      "loss": 1.6662,
      "step": 3230
    },
    {
      "epoch": 0.11877268228307489,
      "grad_norm": 1.6928758946763174,
      "learning_rate": 9.665150500494686e-07,
      "loss": 1.681,
      "step": 3240
    },
    {
      "epoch": 0.11913926463580043,
      "grad_norm": 1.5050927792757436,
      "learning_rate": 9.66307177840651e-07,
      "loss": 1.6669,
      "step": 3250
    },
    {
      "epoch": 0.11950584698852597,
      "grad_norm": 1.179067981511082,
      "learning_rate": 9.66098684893717e-07,
      "loss": 1.6503,
      "step": 3260
    },
    {
      "epoch": 0.11987242934125152,
      "grad_norm": 1.7279906281142485,
      "learning_rate": 9.658895714862082e-07,
      "loss": 1.6331,
      "step": 3270
    },
    {
      "epoch": 0.12023901169397705,
      "grad_norm": 1.1891919657193728,
      "learning_rate": 9.656798378964918e-07,
      "loss": 1.6111,
      "step": 3280
    },
    {
      "epoch": 0.1206055940467026,
      "grad_norm": 1.7749941957068498,
      "learning_rate": 9.654694844037607e-07,
      "loss": 1.666,
      "step": 3290
    },
    {
      "epoch": 0.12097217639942813,
      "grad_norm": 1.5093366351881725,
      "learning_rate": 9.65258511288033e-07,
      "loss": 1.6569,
      "step": 3300
    },
    {
      "epoch": 0.12133875875215368,
      "grad_norm": 1.2872309950824516,
      "learning_rate": 9.650469188301512e-07,
      "loss": 1.6697,
      "step": 3310
    },
    {
      "epoch": 0.12170534110487921,
      "grad_norm": 1.2299002535631731,
      "learning_rate": 9.648347073117832e-07,
      "loss": 1.6413,
      "step": 3320
    },
    {
      "epoch": 0.12207192345760474,
      "grad_norm": 1.407253463937065,
      "learning_rate": 9.6462187701542e-07,
      "loss": 1.6757,
      "step": 3330
    },
    {
      "epoch": 0.12243850581033029,
      "grad_norm": 1.166071729039829,
      "learning_rate": 9.644084282243768e-07,
      "loss": 1.6654,
      "step": 3340
    },
    {
      "epoch": 0.12280508816305583,
      "grad_norm": 1.558952263125209,
      "learning_rate": 9.641943612227921e-07,
      "loss": 1.6807,
      "step": 3350
    },
    {
      "epoch": 0.12317167051578137,
      "grad_norm": 1.3374281457093373,
      "learning_rate": 9.639796762956276e-07,
      "loss": 1.6664,
      "step": 3360
    },
    {
      "epoch": 0.1235382528685069,
      "grad_norm": 1.1902844247942133,
      "learning_rate": 9.637643737286667e-07,
      "loss": 1.6914,
      "step": 3370
    },
    {
      "epoch": 0.12390483522123245,
      "grad_norm": 1.2998133772041194,
      "learning_rate": 9.63548453808516e-07,
      "loss": 1.7112,
      "step": 3380
    },
    {
      "epoch": 0.12427141757395799,
      "grad_norm": 1.3162405748836254,
      "learning_rate": 9.633319168226036e-07,
      "loss": 1.6936,
      "step": 3390
    },
    {
      "epoch": 0.12463799992668353,
      "grad_norm": 1.3677758198871173,
      "learning_rate": 9.631147630591782e-07,
      "loss": 1.6883,
      "step": 3400
    },
    {
      "epoch": 0.12500458227940908,
      "grad_norm": 1.2054292111865461,
      "learning_rate": 9.62896992807311e-07,
      "loss": 1.6576,
      "step": 3410
    },
    {
      "epoch": 0.1253711646321346,
      "grad_norm": 1.156101638091166,
      "learning_rate": 9.626786063568925e-07,
      "loss": 1.6667,
      "step": 3420
    },
    {
      "epoch": 0.12573774698486015,
      "grad_norm": 1.3745543808654352,
      "learning_rate": 9.624596039986343e-07,
      "loss": 1.6712,
      "step": 3430
    },
    {
      "epoch": 0.1261043293375857,
      "grad_norm": 1.178401890967186,
      "learning_rate": 9.622399860240679e-07,
      "loss": 1.6474,
      "step": 3440
    },
    {
      "epoch": 0.12647091169031122,
      "grad_norm": 1.4332376083467566,
      "learning_rate": 9.620197527255436e-07,
      "loss": 1.6655,
      "step": 3450
    },
    {
      "epoch": 0.12683749404303676,
      "grad_norm": 1.2402171846377348,
      "learning_rate": 9.617989043962315e-07,
      "loss": 1.6349,
      "step": 3460
    },
    {
      "epoch": 0.1272040763957623,
      "grad_norm": 1.1586534075249035,
      "learning_rate": 9.615774413301201e-07,
      "loss": 1.6514,
      "step": 3470
    },
    {
      "epoch": 0.12757065874848786,
      "grad_norm": 1.3594354851138566,
      "learning_rate": 9.613553638220162e-07,
      "loss": 1.6516,
      "step": 3480
    },
    {
      "epoch": 0.12793724110121338,
      "grad_norm": 1.6613648157437189,
      "learning_rate": 9.611326721675447e-07,
      "loss": 1.6111,
      "step": 3490
    },
    {
      "epoch": 0.12830382345393893,
      "grad_norm": 1.1659314128590663,
      "learning_rate": 9.60909366663148e-07,
      "loss": 1.6144,
      "step": 3500
    },
    {
      "epoch": 0.12867040580666447,
      "grad_norm": 1.3825427999836462,
      "learning_rate": 9.606854476060858e-07,
      "loss": 1.6355,
      "step": 3510
    },
    {
      "epoch": 0.12903698815939002,
      "grad_norm": 1.3221664320987678,
      "learning_rate": 9.604609152944339e-07,
      "loss": 1.6582,
      "step": 3520
    },
    {
      "epoch": 0.12940357051211554,
      "grad_norm": 1.223865417664176,
      "learning_rate": 9.602357700270848e-07,
      "loss": 1.6629,
      "step": 3530
    },
    {
      "epoch": 0.1297701528648411,
      "grad_norm": 1.2654800350319806,
      "learning_rate": 9.600100121037478e-07,
      "loss": 1.6746,
      "step": 3540
    },
    {
      "epoch": 0.13013673521756663,
      "grad_norm": 1.5629673478694224,
      "learning_rate": 9.597836418249463e-07,
      "loss": 1.598,
      "step": 3550
    },
    {
      "epoch": 0.13050331757029215,
      "grad_norm": 1.434783120339992,
      "learning_rate": 9.5955665949202e-07,
      "loss": 1.6667,
      "step": 3560
    },
    {
      "epoch": 0.1308698999230177,
      "grad_norm": 1.391092196783546,
      "learning_rate": 9.593290654071227e-07,
      "loss": 1.6533,
      "step": 3570
    },
    {
      "epoch": 0.13123648227574325,
      "grad_norm": 1.4923072292703214,
      "learning_rate": 9.591008598732227e-07,
      "loss": 1.6742,
      "step": 3580
    },
    {
      "epoch": 0.1316030646284688,
      "grad_norm": 1.313620532521857,
      "learning_rate": 9.588720431941024e-07,
      "loss": 1.643,
      "step": 3590
    },
    {
      "epoch": 0.13196964698119432,
      "grad_norm": 1.527900388849829,
      "learning_rate": 9.586426156743576e-07,
      "loss": 1.6466,
      "step": 3600
    },
    {
      "epoch": 0.13233622933391986,
      "grad_norm": 1.3345529937125478,
      "learning_rate": 9.584125776193977e-07,
      "loss": 1.6242,
      "step": 3610
    },
    {
      "epoch": 0.1327028116866454,
      "grad_norm": 1.1722053149478573,
      "learning_rate": 9.581819293354437e-07,
      "loss": 1.6361,
      "step": 3620
    },
    {
      "epoch": 0.13306939403937096,
|
"grad_norm": 1.448965551365503, |
|
"learning_rate": 9.579506711295303e-07, |
|
"loss": 1.6766, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.13343597639209648, |
|
"grad_norm": 1.435539195626326, |
|
"learning_rate": 9.57718803309503e-07, |
|
"loss": 1.6639, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.13380255874482203, |
|
"grad_norm": 1.5710598550118229, |
|
"learning_rate": 9.574863261840195e-07, |
|
"loss": 1.6821, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.13416914109754757, |
|
"grad_norm": 1.3432388820323078, |
|
"learning_rate": 9.572532400625486e-07, |
|
"loss": 1.6578, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.1345357234502731, |
|
"grad_norm": 1.4304292951831412, |
|
"learning_rate": 9.570195452553692e-07, |
|
"loss": 1.6683, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.13490230580299864, |
|
"grad_norm": 1.293030659950829, |
|
"learning_rate": 9.567852420735707e-07, |
|
"loss": 1.6712, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.1352688881557242, |
|
"grad_norm": 1.5727628914988818, |
|
"learning_rate": 9.565503308290529e-07, |
|
"loss": 1.6362, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.13563547050844973, |
|
"grad_norm": 1.6929875598843593, |
|
"learning_rate": 9.56314811834524e-07, |
|
"loss": 1.6734, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.13600205286117525, |
|
"grad_norm": 1.5989548687758315, |
|
"learning_rate": 9.560786854035027e-07, |
|
"loss": 1.6449, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.1363686352139008, |
|
"grad_norm": 1.5032676879166582, |
|
"learning_rate": 9.558419518503146e-07, |
|
"loss": 1.6572, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.13673521756662635, |
|
"grad_norm": 1.4171570128132858, |
|
"learning_rate": 9.55604611490095e-07, |
|
"loss": 1.6084, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.13710179991935187, |
|
"grad_norm": 1.445587424899926, |
|
"learning_rate": 9.553666646387859e-07, |
|
"loss": 1.6226, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.13746838227207742, |
|
"grad_norm": 1.3746442868420083, |
|
"learning_rate": 9.55128111613137e-07, |
|
"loss": 1.6244, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.13783496462480296, |
|
"grad_norm": 1.379515983296158, |
|
"learning_rate": 9.548889527307052e-07, |
|
"loss": 1.6178, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.1382015469775285, |
|
"grad_norm": 1.3571114141269711, |
|
"learning_rate": 9.546491883098536e-07, |
|
"loss": 1.6295, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.13856812933025403, |
|
"grad_norm": 1.463273179907825, |
|
"learning_rate": 9.544088186697514e-07, |
|
"loss": 1.6252, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.13893471168297958, |
|
"grad_norm": 1.409249057690562, |
|
"learning_rate": 9.541678441303736e-07, |
|
"loss": 1.6226, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.13930129403570513, |
|
"grad_norm": 1.2549772425250405, |
|
"learning_rate": 9.539262650125003e-07, |
|
"loss": 1.6904, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.13966787638843067, |
|
"grad_norm": 1.398529314496367, |
|
"learning_rate": 9.536840816377163e-07, |
|
"loss": 1.641, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.1400344587411562, |
|
"grad_norm": 1.4089240361542354, |
|
"learning_rate": 9.534412943284111e-07, |
|
"loss": 1.6749, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.14040104109388174, |
|
"grad_norm": 1.2690921990550241, |
|
"learning_rate": 9.53197903407778e-07, |
|
"loss": 1.6483, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.1407676234466073, |
|
"grad_norm": 1.443019453596183, |
|
"learning_rate": 9.529539091998138e-07, |
|
"loss": 1.5942, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.1411342057993328, |
|
"grad_norm": 1.3973353826502415, |
|
"learning_rate": 9.527093120293179e-07, |
|
"loss": 1.6637, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.14150078815205835, |
|
"grad_norm": 1.612241752672322, |
|
"learning_rate": 9.524641122218934e-07, |
|
"loss": 1.6144, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.1418673705047839, |
|
"grad_norm": 1.6392078912198202, |
|
"learning_rate": 9.522183101039447e-07, |
|
"loss": 1.599, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.14223395285750945, |
|
"grad_norm": 1.3307238721886945, |
|
"learning_rate": 9.519719060026784e-07, |
|
"loss": 1.6692, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.14260053521023497, |
|
"grad_norm": 1.3570795255125636, |
|
"learning_rate": 9.517249002461023e-07, |
|
"loss": 1.6871, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.14296711756296052, |
|
"grad_norm": 1.4037736413570712, |
|
"learning_rate": 9.514772931630253e-07, |
|
"loss": 1.5922, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.14333369991568606, |
|
"grad_norm": 1.6691508908927133, |
|
"learning_rate": 9.512290850830564e-07, |
|
"loss": 1.5939, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.1437002822684116, |
|
"grad_norm": 1.2746936442730004, |
|
"learning_rate": 9.509802763366052e-07, |
|
"loss": 1.6376, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.14406686462113713, |
|
"grad_norm": 1.7263750991736497, |
|
"learning_rate": 9.507308672548803e-07, |
|
"loss": 1.6251, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.14443344697386268, |
|
"grad_norm": 1.6162337099963227, |
|
"learning_rate": 9.504808581698898e-07, |
|
"loss": 1.6855, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.14480002932658823, |
|
"grad_norm": 1.4400774058967862, |
|
"learning_rate": 9.502302494144405e-07, |
|
"loss": 1.6688, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.14516661167931375, |
|
"grad_norm": 1.4106971014212684, |
|
"learning_rate": 9.499790413221372e-07, |
|
"loss": 1.6212, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.1455331940320393, |
|
"grad_norm": 1.549216443416639, |
|
"learning_rate": 9.49727234227383e-07, |
|
"loss": 1.6316, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.14589977638476484, |
|
"grad_norm": 1.2499725096259189, |
|
"learning_rate": 9.494748284653779e-07, |
|
"loss": 1.6113, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.1462663587374904, |
|
"grad_norm": 1.8429540203762498, |
|
"learning_rate": 9.492218243721192e-07, |
|
"loss": 1.6424, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.1466329410902159, |
|
"grad_norm": 1.4097823826329705, |
|
"learning_rate": 9.489682222844004e-07, |
|
"loss": 1.5986, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1466329410902159, |
|
"eval_accuracy": 0.634133690356089, |
|
"eval_loss": 1.6327084302902222, |
|
"eval_runtime": 310.7367, |
|
"eval_samples_per_second": 10.642, |
|
"eval_steps_per_second": 0.888, |
|
"step": 4000 |
|
}, |
|
{
"epoch": 0.14699952344294145,
"grad_norm": 1.4923503061339742,
"learning_rate": 9.487140225398112e-07,
"loss": 1.6354,
"step": 4010
},
{
"epoch": 0.147366105795667,
"grad_norm": 1.4794551483340477,
"learning_rate": 9.484592254767368e-07,
"loss": 1.6337,
"step": 4020
},
{
"epoch": 0.14773268814839255,
"grad_norm": 1.5712257291796352,
"learning_rate": 9.482038314343577e-07,
"loss": 1.6569,
"step": 4030
},
{
"epoch": 0.14809927050111807,
"grad_norm": 1.7977345143090582,
"learning_rate": 9.479478407526489e-07,
"loss": 1.6489,
"step": 4040
},
{
"epoch": 0.14846585285384362,
"grad_norm": 1.3741458319499518,
"learning_rate": 9.476912537723797e-07,
"loss": 1.6133,
"step": 4050
},
{
"epoch": 0.14883243520656916,
"grad_norm": 1.4690331639136838,
"learning_rate": 9.474340708351131e-07,
"loss": 1.6232,
"step": 4060
},
{
"epoch": 0.14919901755929468,
"grad_norm": 1.2959341038239927,
"learning_rate": 9.471762922832059e-07,
"loss": 1.6136,
"step": 4070
},
{
"epoch": 0.14956559991202023,
"grad_norm": 1.3662274482371721,
"learning_rate": 9.469179184598068e-07,
"loss": 1.6568,
"step": 4080
},
{
"epoch": 0.14993218226474578,
"grad_norm": 1.6303487241504246,
"learning_rate": 9.46658949708858e-07,
"loss": 1.5929,
"step": 4090
},
{
"epoch": 0.15029876461747133,
"grad_norm": 1.5690296034603222,
"learning_rate": 9.463993863750927e-07,
"loss": 1.6273,
"step": 4100
},
{
"epoch": 0.15066534697019685,
"grad_norm": 1.4565888691647535,
"learning_rate": 9.461392288040364e-07,
"loss": 1.6111,
"step": 4110
},
{
"epoch": 0.1510319293229224,
"grad_norm": 1.3399651168141258,
"learning_rate": 9.458784773420052e-07,
"loss": 1.6317,
"step": 4120
},
{
"epoch": 0.15139851167564794,
"grad_norm": 1.4314663401678571,
"learning_rate": 9.456171323361057e-07,
"loss": 1.6149,
"step": 4130
},
{
"epoch": 0.1517650940283735,
"grad_norm": 1.8610614612324794,
"learning_rate": 9.45355194134235e-07,
"loss": 1.6129,
"step": 4140
},
{
"epoch": 0.152131676381099,
"grad_norm": 1.4894532553388709,
"learning_rate": 9.450926630850795e-07,
"loss": 1.609,
"step": 4150
},
{
"epoch": 0.15249825873382455,
"grad_norm": 1.4046406522547454,
"learning_rate": 9.44829539538115e-07,
"loss": 1.5696,
"step": 4160
},
{
"epoch": 0.1528648410865501,
"grad_norm": 1.507747542986857,
"learning_rate": 9.445658238436056e-07,
"loss": 1.6105,
"step": 4170
},
{
"epoch": 0.15323142343927562,
"grad_norm": 1.5105255618831799,
"learning_rate": 9.443015163526043e-07,
"loss": 1.6656,
"step": 4180
},
{
"epoch": 0.15359800579200117,
"grad_norm": 1.409667843388443,
"learning_rate": 9.440366174169514e-07,
"loss": 1.6143,
"step": 4190
},
{
"epoch": 0.15396458814472672,
"grad_norm": 1.4899089219548238,
"learning_rate": 9.437711273892748e-07,
"loss": 1.6434,
"step": 4200
},
{
"epoch": 0.15433117049745226,
"grad_norm": 1.3835730704800184,
"learning_rate": 9.435050466229892e-07,
"loss": 1.5896,
"step": 4210
},
{
"epoch": 0.15469775285017778,
"grad_norm": 1.5192649294767298,
"learning_rate": 9.432383754722953e-07,
"loss": 1.5982,
"step": 4220
},
{
"epoch": 0.15506433520290333,
"grad_norm": 1.414847151501446,
"learning_rate": 9.429711142921804e-07,
"loss": 1.6195,
"step": 4230
},
{
"epoch": 0.15543091755562888,
"grad_norm": 1.6343731391974052,
"learning_rate": 9.427032634384166e-07,
"loss": 1.6571,
"step": 4240
},
{
"epoch": 0.15579749990835443,
"grad_norm": 1.3341873108704791,
"learning_rate": 9.424348232675612e-07,
"loss": 1.6592,
"step": 4250
},
{
"epoch": 0.15616408226107995,
"grad_norm": 1.6008064117545706,
"learning_rate": 9.421657941369561e-07,
"loss": 1.5976,
"step": 4260
},
{
"epoch": 0.1565306646138055,
"grad_norm": 1.5239464972441716,
"learning_rate": 9.418961764047271e-07,
"loss": 1.6696,
"step": 4270
},
{
"epoch": 0.15689724696653104,
"grad_norm": 1.4769248460119957,
"learning_rate": 9.416259704297836e-07,
"loss": 1.5887,
"step": 4280
},
{
"epoch": 0.15726382931925656,
"grad_norm": 1.5681596592695635,
"learning_rate": 9.413551765718178e-07,
"loss": 1.6013,
"step": 4290
},
{
"epoch": 0.1576304116719821,
"grad_norm": 1.631287334977878,
"learning_rate": 9.410837951913049e-07,
"loss": 1.5945,
"step": 4300
},
{
"epoch": 0.15799699402470765,
"grad_norm": 1.4050312863210865,
"learning_rate": 9.408118266495019e-07,
"loss": 1.6402,
"step": 4310
},
{
"epoch": 0.1583635763774332,
"grad_norm": 1.5578526902775003,
"learning_rate": 9.405392713084475e-07,
"loss": 1.5887,
"step": 4320
},
{
"epoch": 0.15873015873015872,
"grad_norm": 1.838536265304532,
"learning_rate": 9.402661295309613e-07,
"loss": 1.6579,
"step": 4330
},
{
"epoch": 0.15909674108288427,
"grad_norm": 1.399860997384879,
"learning_rate": 9.399924016806442e-07,
"loss": 1.6393,
"step": 4340
},
{
"epoch": 0.15946332343560982,
"grad_norm": 1.5068872354692342,
"learning_rate": 9.397180881218764e-07,
"loss": 1.615,
"step": 4350
},
{
"epoch": 0.15982990578833534,
"grad_norm": 1.3780932641355175,
"learning_rate": 9.394431892198187e-07,
"loss": 1.5897,
"step": 4360
},
{
"epoch": 0.16019648814106088,
"grad_norm": 1.3266983904985465,
"learning_rate": 9.391677053404102e-07,
"loss": 1.622,
"step": 4370
},
{
"epoch": 0.16056307049378643,
"grad_norm": 1.620877234564149,
"learning_rate": 9.388916368503695e-07,
"loss": 1.5967,
"step": 4380
},
{
"epoch": 0.16092965284651198,
"grad_norm": 1.4779982203811086,
"learning_rate": 9.386149841171927e-07,
"loss": 1.6698,
"step": 4390
},
{
"epoch": 0.1612962351992375,
"grad_norm": 1.8674907963100393,
"learning_rate": 9.38337747509154e-07,
"loss": 1.587,
"step": 4400
},
{
"epoch": 0.16166281755196305,
"grad_norm": 1.253158061665667,
"learning_rate": 9.380599273953052e-07,
"loss": 1.5428,
"step": 4410
},
{
"epoch": 0.1620293999046886,
"grad_norm": 1.3525050799204679,
"learning_rate": 9.37781524145474e-07,
"loss": 1.6247,
"step": 4420
},
{
"epoch": 0.16239598225741414,
"grad_norm": 1.4613300416955568,
"learning_rate": 9.375025381302654e-07,
"loss": 1.6224,
"step": 4430
},
{
"epoch": 0.16276256461013966,
"grad_norm": 1.2944336505844816,
"learning_rate": 9.372229697210592e-07,
"loss": 1.6073,
"step": 4440
},
{
"epoch": 0.1631291469628652,
"grad_norm": 1.5174622698952627,
"learning_rate": 9.369428192900108e-07,
"loss": 1.6071,
"step": 4450
},
{
"epoch": 0.16349572931559075,
"grad_norm": 1.338534858401422,
"learning_rate": 9.366620872100508e-07,
"loss": 1.6601,
"step": 4460
},
{
"epoch": 0.16386231166831627,
"grad_norm": 1.6728271928417346,
"learning_rate": 9.363807738548834e-07,
"loss": 1.551,
"step": 4470
},
{
"epoch": 0.16422889402104182,
"grad_norm": 1.302057455107361,
"learning_rate": 9.360988795989873e-07,
"loss": 1.6131,
"step": 4480
},
{
"epoch": 0.16459547637376737,
"grad_norm": 1.3688499844245678,
"learning_rate": 9.358164048176136e-07,
"loss": 1.6117,
"step": 4490
},
{
"epoch": 0.16496205872649292,
"grad_norm": 1.8246828901080199,
"learning_rate": 9.355333498867869e-07,
"loss": 1.5894,
"step": 4500
},
{
"epoch": 0.16532864107921844,
"grad_norm": 1.6028775096282735,
"learning_rate": 9.352497151833038e-07,
"loss": 1.614,
"step": 4510
},
{
"epoch": 0.16569522343194398,
"grad_norm": 1.4820831927771527,
"learning_rate": 9.349655010847329e-07,
"loss": 1.6046,
"step": 4520
},
{
"epoch": 0.16606180578466953,
"grad_norm": 1.7672157547664196,
"learning_rate": 9.346807079694139e-07,
"loss": 1.5998,
"step": 4530
},
{
"epoch": 0.16642838813739508,
"grad_norm": 1.399533793932768,
"learning_rate": 9.34395336216457e-07,
"loss": 1.6209,
"step": 4540
},
{
"epoch": 0.1667949704901206,
"grad_norm": 1.3639375879771105,
"learning_rate": 9.341093862057432e-07,
"loss": 1.6321,
"step": 4550
},
{
"epoch": 0.16716155284284615,
"grad_norm": 1.5049904120253712,
"learning_rate": 9.338228583179231e-07,
"loss": 1.5531,
"step": 4560
},
{
"epoch": 0.1675281351955717,
"grad_norm": 1.2985124195396522,
"learning_rate": 9.335357529344162e-07,
"loss": 1.5925,
"step": 4570
},
{
"epoch": 0.1678947175482972,
"grad_norm": 1.6446327484619145,
"learning_rate": 9.332480704374113e-07,
"loss": 1.5926,
"step": 4580
},
{
"epoch": 0.16826129990102276,
"grad_norm": 1.6322229820052805,
"learning_rate": 9.329598112098649e-07,
"loss": 1.6415,
"step": 4590
},
{
"epoch": 0.1686278822537483,
"grad_norm": 1.4469690988313273,
"learning_rate": 9.326709756355018e-07,
"loss": 1.5885,
"step": 4600
},
{
"epoch": 0.16899446460647385,
"grad_norm": 2.0102392352379415,
"learning_rate": 9.323815640988135e-07,
"loss": 1.559,
"step": 4610
},
{
"epoch": 0.16936104695919937,
"grad_norm": 2.121900247865438,
"learning_rate": 9.320915769850585e-07,
"loss": 1.628,
"step": 4620
},
{
"epoch": 0.16972762931192492,
"grad_norm": 1.6562713457587275,
"learning_rate": 9.318010146802615e-07,
"loss": 1.6442,
"step": 4630
},
{
"epoch": 0.17009421166465047,
"grad_norm": 1.825933954099794,
"learning_rate": 9.315098775712127e-07,
"loss": 1.5848,
"step": 4640
},
{
"epoch": 0.17046079401737602,
"grad_norm": 2.2902161148174445,
"learning_rate": 9.312181660454677e-07,
"loss": 1.5825,
"step": 4650
},
{
"epoch": 0.17082737637010154,
"grad_norm": 1.392734199429953,
"learning_rate": 9.309258804913465e-07,
"loss": 1.6126,
"step": 4660
},
{
"epoch": 0.17119395872282708,
"grad_norm": 1.565256666892175,
"learning_rate": 9.306330212979334e-07,
"loss": 1.6022,
"step": 4670
},
{
"epoch": 0.17156054107555263,
"grad_norm": 1.7600380550932417,
"learning_rate": 9.303395888550763e-07,
"loss": 1.5663,
"step": 4680
},
{
"epoch": 0.17192712342827815,
"grad_norm": 1.5247880984614344,
"learning_rate": 9.300455835533863e-07,
"loss": 1.6012,
"step": 4690
},
{
"epoch": 0.1722937057810037,
"grad_norm": 1.7352070019598504,
"learning_rate": 9.297510057842367e-07,
"loss": 1.5681,
"step": 4700
},
{
"epoch": 0.17266028813372924,
"grad_norm": 1.6435683033446582,
"learning_rate": 9.294558559397633e-07,
"loss": 1.6687,
"step": 4710
},
{
"epoch": 0.1730268704864548,
"grad_norm": 1.3964234370853204,
"learning_rate": 9.291601344128631e-07,
"loss": 1.5829,
"step": 4720
},
{
"epoch": 0.1733934528391803,
"grad_norm": 1.76715189072495,
"learning_rate": 9.288638415971944e-07,
"loss": 1.5724,
"step": 4730
},
{
"epoch": 0.17376003519190586,
"grad_norm": 1.3087839062281306,
"learning_rate": 9.285669778871758e-07,
"loss": 1.6033,
"step": 4740
},
{
"epoch": 0.1741266175446314,
"grad_norm": 1.7592015890177557,
"learning_rate": 9.282695436779857e-07,
"loss": 1.5787,
"step": 4750
},
{
"epoch": 0.17449319989735695,
"grad_norm": 1.5281595493710598,
"learning_rate": 9.279715393655625e-07,
"loss": 1.5593,
"step": 4760
},
{
"epoch": 0.17485978225008247,
"grad_norm": 1.738599325299021,
"learning_rate": 9.276729653466029e-07,
"loss": 1.5669,
"step": 4770
},
{
"epoch": 0.17522636460280802,
"grad_norm": 1.594132633669574,
"learning_rate": 9.273738220185624e-07,
"loss": 1.623,
"step": 4780
},
{
"epoch": 0.17559294695553357,
"grad_norm": 2.226861365359913,
"learning_rate": 9.27074109779654e-07,
"loss": 1.6368,
"step": 4790
},
{
"epoch": 0.1759595293082591,
"grad_norm": 1.7870988536401553,
"learning_rate": 9.267738290288484e-07,
"loss": 1.5905,
"step": 4800
},
{
"epoch": 0.17632611166098464,
"grad_norm": 1.6753244560734581,
"learning_rate": 9.264729801658726e-07,
"loss": 1.588,
"step": 4810
},
{
"epoch": 0.17669269401371018,
"grad_norm": 1.5163383708898754,
"learning_rate": 9.261715635912105e-07,
"loss": 1.6068,
"step": 4820
},
{
"epoch": 0.17705927636643573,
"grad_norm": 1.6054513357762625,
"learning_rate": 9.258695797061011e-07,
"loss": 1.5623,
"step": 4830
},
{
"epoch": 0.17742585871916125,
"grad_norm": 1.7549519455125482,
"learning_rate": 9.255670289125392e-07,
"loss": 1.6342,
"step": 4840
},
{
"epoch": 0.1777924410718868,
"grad_norm": 1.5524081159338652,
"learning_rate": 9.252639116132737e-07,
"loss": 1.5866,
"step": 4850
},
{
"epoch": 0.17815902342461234,
"grad_norm": 1.5466546969225983,
"learning_rate": 9.249602282118078e-07,
"loss": 1.6022,
"step": 4860
},
{
"epoch": 0.1785256057773379,
"grad_norm": 1.4959615382996556,
"learning_rate": 9.246559791123984e-07,
"loss": 1.6196,
"step": 4870
},
{
"epoch": 0.1788921881300634,
"grad_norm": 1.4914720900146645,
"learning_rate": 9.243511647200554e-07,
"loss": 1.5919,
"step": 4880
},
{
"epoch": 0.17925877048278896,
"grad_norm": 1.5337435868741187,
"learning_rate": 9.240457854405411e-07,
"loss": 1.6044,
"step": 4890
},
{
"epoch": 0.1796253528355145,
"grad_norm": 1.6816858785763387,
"learning_rate": 9.237398416803702e-07,
"loss": 1.5634,
"step": 4900
},
{
"epoch": 0.17999193518824003,
"grad_norm": 1.8428666379108207,
"learning_rate": 9.234333338468079e-07,
"loss": 1.5595,
"step": 4910
},
{
"epoch": 0.18035851754096557,
"grad_norm": 1.4112423758680814,
"learning_rate": 9.231262623478712e-07,
"loss": 1.5958,
"step": 4920
},
{
"epoch": 0.18072509989369112,
"grad_norm": 1.9379415330464052,
"learning_rate": 9.228186275923271e-07,
"loss": 1.6132,
"step": 4930
},
{
"epoch": 0.18109168224641667,
"grad_norm": 1.6478659028610085,
"learning_rate": 9.225104299896923e-07,
"loss": 1.5253,
"step": 4940
},
{
"epoch": 0.1814582645991422,
"grad_norm": 1.4723128432871142,
"learning_rate": 9.222016699502329e-07,
"loss": 1.6025,
"step": 4950
},
{
"epoch": 0.18182484695186774,
"grad_norm": 1.7186069161894069,
"learning_rate": 9.218923478849636e-07,
"loss": 1.5888,
"step": 4960
},
{
"epoch": 0.18219142930459328,
"grad_norm": 2.0518524516759706,
"learning_rate": 9.215824642056473e-07,
"loss": 1.6131,
"step": 4970
},
{
"epoch": 0.18255801165731883,
"grad_norm": 1.7336503978028492,
"learning_rate": 9.212720193247946e-07,
"loss": 1.5725,
"step": 4980
},
{
"epoch": 0.18292459401004435,
"grad_norm": 1.4722133429873332,
"learning_rate": 9.209610136556629e-07,
"loss": 1.5547,
"step": 4990
},
{
"epoch": 0.1832911763627699,
"grad_norm": 1.6753596780660358,
"learning_rate": 9.206494476122565e-07,
"loss": 1.5997,
"step": 5000
},
{
"epoch": 0.1832911763627699,
"eval_accuracy": 0.642745649510724,
"eval_loss": 1.587723731994629,
"eval_runtime": 309.6063,
"eval_samples_per_second": 10.681,
"eval_steps_per_second": 0.891,
"step": 5000
},
{
"epoch": 0.18365775871549544,
"grad_norm": 1.5685677710443469,
"learning_rate": 9.203373216093253e-07,
"loss": 1.5679,
"step": 5010
},
{
"epoch": 0.18402434106822096,
"grad_norm": 1.8335955057050302,
"learning_rate": 9.200246360623647e-07,
"loss": 1.5621,
"step": 5020
},
{
"epoch": 0.1843909234209465,
"grad_norm": 1.522191845438261,
"learning_rate": 9.19711391387615e-07,
"loss": 1.5729,
"step": 5030
},
{
"epoch": 0.18475750577367206,
"grad_norm": 1.6776006382527855,
"learning_rate": 9.193975880020609e-07,
"loss": 1.59,
"step": 5040
},
{
"epoch": 0.1851240881263976,
"grad_norm": 1.626198881855077,
"learning_rate": 9.190832263234307e-07,
"loss": 1.5274,
"step": 5050
},
{
"epoch": 0.18549067047912313,
"grad_norm": 1.7849118070867178,
"learning_rate": 9.18768306770196e-07,
"loss": 1.5976,
"step": 5060
},
{
"epoch": 0.18585725283184867,
"grad_norm": 1.6492509263028217,
"learning_rate": 9.184528297615706e-07,
"loss": 1.574,
"step": 5070
},
{
"epoch": 0.18622383518457422,
"grad_norm": 1.6650634512326183,
"learning_rate": 9.181367957175111e-07,
"loss": 1.6145,
"step": 5080
},
{
"epoch": 0.18659041753729974,
"grad_norm": 1.728522905813247,
"learning_rate": 9.178202050587152e-07,
"loss": 1.623,
"step": 5090
},
{
"epoch": 0.1869569998900253,
"grad_norm": 1.5996442049523565,
"learning_rate": 9.175030582066215e-07,
"loss": 1.5807,
"step": 5100
},
{
"epoch": 0.18732358224275084,
"grad_norm": 2.127736796999369,
"learning_rate": 9.17185355583409e-07,
"loss": 1.6288,
"step": 5110
},
{
"epoch": 0.18769016459547638,
"grad_norm": 1.7060344023543381,
"learning_rate": 9.16867097611997e-07,
"loss": 1.5706,
"step": 5120
},
{
"epoch": 0.1880567469482019,
"grad_norm": 1.6633154215840553,
"learning_rate": 9.165482847160433e-07,
"loss": 1.6202,
"step": 5130
},
{
"epoch": 0.18842332930092745,
"grad_norm": 2.008854546754292,
"learning_rate": 9.162289173199449e-07,
"loss": 1.5684,
"step": 5140
},
{
"epoch": 0.188789911653653,
"grad_norm": 1.8267125273776432,
"learning_rate": 9.159089958488368e-07,
"loss": 1.5463,
"step": 5150
},
{
"epoch": 0.18915649400637854,
"grad_norm": 1.5564239251002085,
"learning_rate": 9.155885207285917e-07,
"loss": 1.5432,
"step": 5160
},
{
"epoch": 0.18952307635910406,
"grad_norm": 1.6146271060205803,
"learning_rate": 9.152674923858192e-07,
"loss": 1.5524,
"step": 5170
},
{
"epoch": 0.1898896587118296,
"grad_norm": 1.5552810397285535,
"learning_rate": 9.149459112478653e-07,
"loss": 1.5704,
"step": 5180
},
{
"epoch": 0.19025624106455516,
"grad_norm": 1.5384519496242604,
"learning_rate": 9.146237777428119e-07,
"loss": 1.5832,
"step": 5190
},
{
"epoch": 0.19062282341728068,
"grad_norm": 2.017102331377888,
"learning_rate": 9.143010922994761e-07,
"loss": 1.5652,
"step": 5200
},
{
"epoch": 0.19098940577000623,
"grad_norm": 1.8257390842642465,
"learning_rate": 9.139778553474102e-07,
"loss": 1.6286,
"step": 5210
},
{
"epoch": 0.19135598812273177,
"grad_norm": 1.8375892545538077,
"learning_rate": 9.136540673169e-07,
"loss": 1.5999,
"step": 5220
},
{
"epoch": 0.19172257047545732,
"grad_norm": 2.0587302949543327,
"learning_rate": 9.133297286389652e-07,
"loss": 1.5976,
"step": 5230
},
{
"epoch": 0.19208915282818284,
"grad_norm": 2.011881523827466,
"learning_rate": 9.130048397453586e-07,
"loss": 1.5948,
"step": 5240
},
{
"epoch": 0.1924557351809084,
"grad_norm": 1.8390608792602066,
"learning_rate": 9.126794010685652e-07,
"loss": 1.6149,
"step": 5250
},
{
"epoch": 0.19282231753363394,
"grad_norm": 1.9246481251033047,
"learning_rate": 9.123534130418022e-07,
"loss": 1.5918,
"step": 5260
},
{
"epoch": 0.19318889988635948,
"grad_norm": 1.716961973736044,
"learning_rate": 9.120268760990177e-07,
"loss": 1.5423,
"step": 5270
},
{
"epoch": 0.193555482239085,
"grad_norm": 2.0653331266058053,
"learning_rate": 9.116997906748906e-07,
"loss": 1.5646,
"step": 5280
},
{
"epoch": 0.19392206459181055,
"grad_norm": 1.518359023904073,
"learning_rate": 9.113721572048303e-07,
"loss": 1.5893,
"step": 5290
},
{
"epoch": 0.1942886469445361,
"grad_norm": 1.5221964255305394,
"learning_rate": 9.110439761249752e-07,
"loss": 1.5944,
"step": 5300
},
{
"epoch": 0.19465522929726162,
"grad_norm": 1.591016019300809,
"learning_rate": 9.107152478721929e-07,
"loss": 1.5957,
"step": 5310
},
{
"epoch": 0.19502181164998716,
"grad_norm": 1.6048630337553804,
"learning_rate": 9.103859728840797e-07,
"loss": 1.5373,
"step": 5320
},
{
"epoch": 0.1953883940027127,
"grad_norm": 1.8089344462427293,
"learning_rate": 9.10056151598959e-07,
"loss": 1.5484,
"step": 5330
},
{
"epoch": 0.19575497635543826,
"grad_norm": 1.7077347921127968,
"learning_rate": 9.097257844558821e-07,
"loss": 1.5688,
"step": 5340
},
{
"epoch": 0.19612155870816378,
"grad_norm": 2.0584080275062706,
"learning_rate": 9.093948718946265e-07,
"loss": 1.5202,
"step": 5350
},
{
"epoch": 0.19648814106088933,
"grad_norm": 1.6275162784009292,
"learning_rate": 9.090634143556961e-07,
"loss": 1.5851,
"step": 5360
},
{
"epoch": 0.19685472341361487,
"grad_norm": 1.7941515009032263,
"learning_rate": 9.087314122803198e-07,
"loss": 1.5794,
"step": 5370
},
{
"epoch": 0.19722130576634042,
"grad_norm": 1.72604148825101,
"learning_rate": 9.083988661104519e-07,
"loss": 1.5966,
"step": 5380
},
{
"epoch": 0.19758788811906594,
"grad_norm": 1.7824620622659664,
"learning_rate": 9.080657762887706e-07,
"loss": 1.5893,
"step": 5390
},
{
"epoch": 0.1979544704717915,
"grad_norm": 1.710078177829696,
"learning_rate": 9.077321432586779e-07,
"loss": 1.5668,
"step": 5400
},
{
"epoch": 0.19832105282451704,
"grad_norm": 1.8516264946489545,
"learning_rate": 9.073979674642991e-07,
"loss": 1.6049,
"step": 5410
},
{
"epoch": 0.19868763517724256,
"grad_norm": 2.1561627747886583,
"learning_rate": 9.070632493504815e-07,
"loss": 1.585,
"step": 5420
},
{
"epoch": 0.1990542175299681,
"grad_norm": 1.912041110250784,
"learning_rate": 9.06727989362795e-07,
"loss": 1.5196,
"step": 5430
},
{
"epoch": 0.19942079988269365,
"grad_norm": 1.8404077118276456,
"learning_rate": 9.063921879475306e-07,
"loss": 1.611,
"step": 5440
},
{
"epoch": 0.1997873822354192,
"grad_norm": 1.5865821224681815,
"learning_rate": 9.060558455516996e-07,
"loss": 1.5739,
"step": 5450
},
{
"epoch": 0.20015396458814472,
"grad_norm": 1.9756512969668862,
"learning_rate": 9.057189626230341e-07,
"loss": 1.5002,
"step": 5460
},
{
"epoch": 0.20052054694087026,
"grad_norm": 1.5812577707350812,
"learning_rate": 9.053815396099851e-07,
"loss": 1.5869,
"step": 5470
},
{
"epoch": 0.2008871292935958,
"grad_norm": 2.0162867580185555,
"learning_rate": 9.050435769617231e-07,
"loss": 1.5559,
"step": 5480
},
{
"epoch": 0.20125371164632136,
"grad_norm": 1.899649598636165,
"learning_rate": 9.047050751281368e-07,
"loss": 1.5407,
"step": 5490
},
{
"epoch": 0.20162029399904688,
"grad_norm": 1.9101266806326496,
"learning_rate": 9.043660345598322e-07,
"loss": 1.5576,
"step": 5500
},
{
"epoch": 0.20198687635177243,
"grad_norm": 2.0420669589479403,
"learning_rate": 9.040264557081334e-07,
"loss": 1.557,
"step": 5510
},
{
"epoch": 0.20235345870449797,
"grad_norm": 1.9260883055795428,
"learning_rate": 9.036863390250801e-07,
"loss": 1.5521,
"step": 5520
},
{
"epoch": 0.2027200410572235,
"grad_norm": 1.6555197284342995,
"learning_rate": 9.033456849634284e-07,
"loss": 1.5717,
"step": 5530
},
{
"epoch": 0.20308662340994904,
"grad_norm": 2.153362825776131,
"learning_rate": 9.030044939766497e-07,
"loss": 1.5713,
"step": 5540
},
{
"epoch": 0.2034532057626746,
"grad_norm": 1.910089724316295,
"learning_rate": 9.026627665189303e-07,
"loss": 1.5697,
"step": 5550
},
{
"epoch": 0.20381978811540014,
"grad_norm": 1.7762617538543,
"learning_rate": 9.0232050304517e-07,
"loss": 1.5239,
"step": 5560
},
{
"epoch": 0.20418637046812566,
"grad_norm": 1.7174298843577596,
"learning_rate": 9.019777040109831e-07,
"loss": 1.5276,
"step": 5570
},
{
"epoch": 0.2045529528208512,
"grad_norm": 1.6862369469038345,
"learning_rate": 9.016343698726961e-07,
"loss": 1.5541,
"step": 5580
},
{
"epoch": 0.20491953517357675,
"grad_norm": 1.875834526669963,
"learning_rate": 9.01290501087348e-07,
"loss": 1.555,
"step": 5590
},
{
"epoch": 0.2052861175263023,
"grad_norm": 1.7840227955187389,
"learning_rate": 9.009460981126898e-07,
"loss": 1.5872,
"step": 5600
},
{
"epoch": 0.20565269987902782,
"grad_norm": 1.668168953110993,
"learning_rate": 9.006011614071829e-07,
"loss": 1.599,
"step": 5610
},
{
"epoch": 0.20601928223175336,
"grad_norm": 1.6951419814826267,
"learning_rate": 9.002556914300001e-07,
"loss": 1.5599,
"step": 5620
},
{
"epoch": 0.2063858645844789,
"grad_norm": 2.031183645077938,
"learning_rate": 8.999096886410234e-07,
"loss": 1.5697,
"step": 5630
},
{
"epoch": 0.20675244693720443,
"grad_norm": 2.2433698552413595,
"learning_rate": 8.995631535008442e-07,
"loss": 1.5751,
"step": 5640
},
{
"epoch": 0.20711902928992998,
"grad_norm": 1.96339871171306,
"learning_rate": 8.992160864707629e-07,
"loss": 1.5922,
"step": 5650
},
{
"epoch": 0.20748561164265553,
"grad_norm": 1.7341008984989021,
"learning_rate": 8.988684880127877e-07,
"loss": 1.5476,
"step": 5660
},
{
"epoch": 0.20785219399538107,
"grad_norm": 1.6011033018349554,
"learning_rate": 8.985203585896339e-07,
"loss": 1.5337,
"step": 5670
},
{
"epoch": 0.2082187763481066,
"grad_norm": 1.804008259917083,
"learning_rate": 8.981716986647241e-07,
"loss": 1.548,
"step": 5680
},
{
"epoch": 0.20858535870083214,
"grad_norm": 1.7644993504571036,
"learning_rate": 8.978225087021872e-07,
"loss": 1.5566,
"step": 5690
},
{
"epoch": 0.2089519410535577,
"grad_norm": 2.1995890332913812,
"learning_rate": 8.974727891668568e-07,
"loss": 1.509,
"step": 5700
},
{
"epoch": 0.2093185234062832,
"grad_norm": 1.7307439040874695,
"learning_rate": 8.971225405242724e-07,
"loss": 1.5792,
"step": 5710
},
{
"epoch": 0.20968510575900876,
"grad_norm": 1.8843347719325225,
"learning_rate": 8.967717632406775e-07,
"loss": 1.5745,
"step": 5720
},
{
"epoch": 0.2100516881117343,
"grad_norm": 1.8994279922279045,
"learning_rate": 8.964204577830193e-07,
"loss": 1.5346,
"step": 5730
},
{
"epoch": 0.21041827046445985,
"grad_norm": 2.0146207080838305,
"learning_rate": 8.960686246189479e-07,
"loss": 1.5724,
"step": 5740
},
{
"epoch": 0.21078485281718537,
"grad_norm": 1.9175010632666802,
"learning_rate": 8.957162642168164e-07,
"loss": 1.482,
"step": 5750
},
{
"epoch": 0.21115143516991092,
"grad_norm": 1.6492564643172203,
"learning_rate": 8.953633770456791e-07,
"loss": 1.5635,
"step": 5760
},
{
"epoch": 0.21151801752263646,
"grad_norm": 1.8913486368556613,
"learning_rate": 8.950099635752919e-07,
"loss": 1.5634,
"step": 5770
},
{
"epoch": 0.211884599875362,
"grad_norm": 1.7405053491856226,
"learning_rate": 8.946560242761114e-07,
"loss": 1.5475,
"step": 5780
},
{
"epoch": 0.21225118222808753,
"grad_norm": 1.7166883252641594,
"learning_rate": 8.943015596192938e-07,
"loss": 1.516,
"step": 5790
},
{
"epoch": 0.21261776458081308,
"grad_norm": 1.935712334758643,
"learning_rate": 8.93946570076695e-07,
"loss": 1.5575,
"step": 5800
},
{
"epoch": 0.21298434693353863,
"grad_norm": 1.9385604701128256,
"learning_rate": 8.935910561208693e-07,
"loss": 1.5634,
"step": 5810
},
{
"epoch": 0.21335092928626415,
"grad_norm": 2.557688500744313,
"learning_rate": 8.932350182250694e-07,
"loss": 1.5103,
"step": 5820
},
{
"epoch": 0.2137175116389897,
"grad_norm": 1.7120107495237882,
"learning_rate": 8.928784568632454e-07,
"loss": 1.5332,
"step": 5830
},
{
"epoch": 0.21408409399171524,
"grad_norm": 1.9120958570178155,
"learning_rate": 8.925213725100439e-07,
"loss": 1.5902,
"step": 5840
},
{
"epoch": 0.2144506763444408,
"grad_norm": 2.0551912368717984,
"learning_rate": 8.921637656408081e-07,
"loss": 1.5784,
"step": 5850
},
{
"epoch": 0.2148172586971663,
"grad_norm": 1.9480411905431083,
"learning_rate": 8.918056367315765e-07,
"loss": 1.5551,
"step": 5860
},
{
"epoch": 0.21518384104989186,
"grad_norm": 2.072902657734444,
"learning_rate": 8.914469862590825e-07,
"loss": 1.5555,
"step": 5870
},
{
"epoch": 0.2155504234026174,
"grad_norm": 1.9451661388320578,
"learning_rate": 8.910878147007544e-07,
"loss": 1.5513,
"step": 5880
},
{
"epoch": 0.21591700575534295,
"grad_norm": 2.0629785589418104,
"learning_rate": 8.907281225347132e-07,
"loss": 1.5553,
"step": 5890
},
{
"epoch": 0.21628358810806847,
"grad_norm": 1.863954721076218,
"learning_rate": 8.903679102397735e-07,
"loss": 1.5691,
"step": 5900
},
{
"epoch": 0.21665017046079402,
"grad_norm": 1.8545804685124208,
"learning_rate": 8.900071782954424e-07,
"loss": 1.5331,
"step": 5910
},
{
"epoch": 0.21701675281351956,
"grad_norm": 1.8522158136831326,
"learning_rate": 8.896459271819181e-07,
"loss": 1.5481,
"step": 5920
},
{
"epoch": 0.21738333516624508,
"grad_norm": 2.114169763199409,
"learning_rate": 8.892841573800909e-07,
"loss": 1.5574,
"step": 5930
},
{
"epoch": 0.21774991751897063,
"grad_norm": 2.2195708048317897,
"learning_rate": 8.889218693715405e-07,
"loss": 1.5632,
"step": 5940
},
{
"epoch": 0.21811649987169618,
"grad_norm": 1.9709151192601133,
"learning_rate": 8.885590636385373e-07,
"loss": 1.5861,
"step": 5950
},
{
"epoch": 0.21848308222442173,
"grad_norm": 1.9808333239294875,
"learning_rate": 8.881957406640402e-07,
"loss": 1.5065,
"step": 5960
},
{
"epoch": 0.21884966457714725,
"grad_norm": 2.442742784557856,
"learning_rate": 8.878319009316973e-07,
"loss": 1.5445,
"step": 5970
},
{
"epoch": 0.2192162469298728,
"grad_norm": 2.311119780435353,
"learning_rate": 8.874675449258439e-07,
"loss": 1.5483,
"step": 5980
},
{
"epoch": 0.21958282928259834,
"grad_norm": 2.0035864035930655,
"learning_rate": 8.871026731315031e-07,
"loss": 1.5516,
"step": 5990
},
{
"epoch": 0.2199494116353239,
"grad_norm": 1.9235134048584597,
"learning_rate": 8.867372860343843e-07,
"loss": 1.5841,
"step": 6000
},
{
"epoch": 0.2199494116353239,
"eval_accuracy": 0.6509060196907062,
"eval_loss": 1.540500521659851,
"eval_runtime": 311.0144,
"eval_samples_per_second": 10.633,
"eval_steps_per_second": 0.887,
"step": 6000
},
{
"epoch": 0.2203159939880494,
"grad_norm": 1.7524109005789064,
"learning_rate": 8.863713841208831e-07,
"loss": 1.5597,
"step": 6010
},
{
"epoch": 0.22068257634077496,
"grad_norm": 1.6692328056749952,
"learning_rate": 8.860049678780803e-07,
"loss": 1.4923,
"step": 6020
},
{
"epoch": 0.2210491586935005,
"grad_norm": 1.9399213197528828,
"learning_rate": 8.856380377937411e-07,
"loss": 1.552,
"step": 6030
},
{
"epoch": 0.22141574104622602,
"grad_norm": 2.2904467183798753,
"learning_rate": 8.852705943563153e-07,
"loss": 1.5254,
"step": 6040
},
{
"epoch": 0.22178232339895157,
"grad_norm": 1.8153750134894717,
"learning_rate": 8.849026380549354e-07,
"loss": 1.5141,
"step": 6050
},
{
"epoch": 0.22214890575167712,
"grad_norm": 2.618147882062693,
"learning_rate": 8.84534169379417e-07,
"loss": 1.5427,
"step": 6060
},
{
"epoch": 0.22251548810440266,
"grad_norm": 1.7910988941866253,
"learning_rate": 8.84165188820258e-07,
"loss": 1.5024,
"step": 6070
},
{
"epoch": 0.22288207045712818,
"grad_norm": 2.1174011777995565,
"learning_rate": 8.837956968686371e-07,
"loss": 1.5354,
"step": 6080
},
{
"epoch": 0.22324865280985373,
"grad_norm": 1.9009206870385398,
"learning_rate": 8.834256940164142e-07,
"loss": 1.5147,
"step": 6090
},
{
"epoch": 0.22361523516257928,
"grad_norm": 1.8496325535415874,
"learning_rate": 8.830551807561291e-07,
"loss": 1.5179,
"step": 6100
},
{
"epoch": 0.22398181751530483,
"grad_norm": 1.662570964745413,
"learning_rate": 8.826841575810011e-07,
"loss": 1.5187,
"step": 6110
},
{
"epoch": 0.22434839986803035,
"grad_norm": 1.8932960142147148,
"learning_rate": 8.823126249849283e-07,
"loss": 1.511,
"step": 6120
},
{
"epoch": 0.2247149822207559,
"grad_norm": 2.055911875635135,
"learning_rate": 8.819405834624869e-07,
"loss": 1.5155,
"step": 6130
},
{
"epoch": 0.22508156457348144,
"grad_norm": 2.0651755539958603,
"learning_rate": 8.815680335089308e-07,
"loss": 1.4753,
"step": 6140
},
{
"epoch": 0.22544814692620696,
"grad_norm": 2.0717254734315405,
"learning_rate": 8.811949756201902e-07,
"loss": 1.5565,
"step": 6150
},
{
"epoch": 0.2258147292789325,
"grad_norm": 1.9847422671401158,
"learning_rate": 8.808214102928721e-07,
"loss": 1.5438,
"step": 6160
},
{
"epoch": 0.22618131163165806,
"grad_norm": 2.4190623603018806,
"learning_rate": 8.804473380242583e-07,
"loss": 1.5399,
"step": 6170
},
{
"epoch": 0.2265478939843836,
"grad_norm": 2.20009570928599,
"learning_rate": 8.80072759312306e-07,
"loss": 1.5398,
"step": 6180
},
{
"epoch": 0.22691447633710912,
"grad_norm": 1.9921790637181438,
"learning_rate": 8.796976746556462e-07,
"loss": 1.4771,
"step": 6190
},
{
"epoch": 0.22728105868983467,
"grad_norm": 2.0203680363068344,
"learning_rate": 8.793220845535838e-07,
"loss": 1.5176,
"step": 6200
},
{
"epoch": 0.22764764104256022,
"grad_norm": 2.7532988176359754,
"learning_rate": 8.789459895060962e-07,
"loss": 1.5371,
"step": 6210
},
{
"epoch": 0.22801422339528576,
"grad_norm": 1.937352911027064,
"learning_rate": 8.785693900138329e-07,
"loss": 1.5356,
"step": 6220
},
{
"epoch": 0.22838080574801128,
"grad_norm": 1.9964616803134492,
"learning_rate": 8.781922865781151e-07,
"loss": 1.56,
"step": 6230
},
{
"epoch": 0.22874738810073683,
"grad_norm": 2.106377863408321,
"learning_rate": 8.778146797009349e-07,
"loss": 1.559,
"step": 6240
},
{
"epoch": 0.22911397045346238,
"grad_norm": 1.6409859726466804,
"learning_rate": 8.774365698849547e-07,
"loss": 1.5116,
"step": 6250
},
{
"epoch": 0.2294805528061879,
"grad_norm": 2.305691070208384,
"learning_rate": 8.770579576335058e-07,
"loss": 1.5683,
"step": 6260
},
{
"epoch": 0.22984713515891345,
"grad_norm": 1.7207294769909895,
"learning_rate": 8.766788434505887e-07,
"loss": 1.4618,
"step": 6270
},
{
"epoch": 0.230213717511639,
"grad_norm": 1.9323445658200624,
"learning_rate": 8.762992278408723e-07,
"loss": 1.5618,
"step": 6280
},
{
"epoch": 0.23058029986436454,
"grad_norm": 1.999152732092489,
"learning_rate": 8.759191113096927e-07,
"loss": 1.5569,
"step": 6290
},
{
"epoch": 0.23094688221709006,
"grad_norm": 1.8502749258838977,
"learning_rate": 8.755384943630529e-07,
"loss": 1.5114,
"step": 6300
},
{
"epoch": 0.2313134645698156,
"grad_norm": 2.0061014414371003,
"learning_rate": 8.751573775076219e-07,
"loss": 1.5011,
"step": 6310
},
{
"epoch": 0.23168004692254116,
"grad_norm": 2.064565021271191,
"learning_rate": 8.747757612507345e-07,
"loss": 1.5588,
"step": 6320
},
{
"epoch": 0.23204662927526667,
"grad_norm": 1.878533236916369,
"learning_rate": 8.743936461003898e-07,
"loss": 1.5179,
"step": 6330
},
{
"epoch": 0.23241321162799222,
"grad_norm": 2.080116702687917,
"learning_rate": 8.740110325652515e-07,
"loss": 1.5211,
"step": 6340
},
{
"epoch": 0.23277979398071777,
"grad_norm": 2.2534624739469433,
"learning_rate": 8.736279211546465e-07,
"loss": 1.5077,
"step": 6350
},
{
"epoch": 0.23314637633344332,
"grad_norm": 2.1778452457873527,
"learning_rate": 8.732443123785644e-07,
"loss": 1.5385,
"step": 6360
},
{
"epoch": 0.23351295868616884,
"grad_norm": 2.0802562378092317,
"learning_rate": 8.72860206747657e-07,
"loss": 1.5053,
"step": 6370
},
{
"epoch": 0.23387954103889438,
"grad_norm": 2.197133342414823,
"learning_rate": 8.724756047732376e-07,
"loss": 1.5223,
"step": 6380
},
{
"epoch": 0.23424612339161993,
"grad_norm": 2.3786394596220437,
"learning_rate": 8.720905069672799e-07,
"loss": 1.5124,
"step": 6390
},
{
"epoch": 0.23461270574434548,
"grad_norm": 1.8455501641424978,
"learning_rate": 8.717049138424182e-07,
"loss": 1.525,
"step": 6400
},
{
"epoch": 0.234979288097071,
"grad_norm": 2.0418699202678727,
"learning_rate": 8.713188259119452e-07,
"loss": 1.5082,
"step": 6410
},
{
"epoch": 0.23534587044979655,
"grad_norm": 1.8308136052916946,
"learning_rate": 8.709322436898135e-07,
"loss": 1.4779,
"step": 6420
},
{
"epoch": 0.2357124528025221,
"grad_norm": 2.155105815758525,
"learning_rate": 8.705451676906328e-07,
"loss": 1.5101,
"step": 6430
},
{
"epoch": 0.2360790351552476,
"grad_norm": 1.9647757860923412,
"learning_rate": 8.701575984296702e-07,
"loss": 1.5105,
"step": 6440
},
{
"epoch": 0.23644561750797316,
"grad_norm": 2.051510082680593,
"learning_rate": 8.6976953642285e-07,
"loss": 1.503,
"step": 6450
},
{
"epoch": 0.2368121998606987,
"grad_norm": 2.1386714707947534,
"learning_rate": 8.693809821867517e-07,
"loss": 1.5282,
"step": 6460
},
{
"epoch": 0.23717878221342426,
"grad_norm": 2.1401411616284167,
"learning_rate": 8.689919362386104e-07,
"loss": 1.4949,
"step": 6470
},
{
"epoch": 0.23754536456614977,
"grad_norm": 1.956666297999974,
"learning_rate": 8.686023990963157e-07,
"loss": 1.4993,
"step": 6480
},
{
"epoch": 0.23791194691887532,
"grad_norm": 2.0257118859168672,
"learning_rate": 8.682123712784112e-07,
"loss": 1.5186,
"step": 6490
},
{
"epoch": 0.23827852927160087,
"grad_norm": 1.895169068962553,
"learning_rate": 8.678218533040937e-07,
"loss": 1.526,
"step": 6500
},
{
"epoch": 0.23864511162432642,
"grad_norm": 6.529056788123207,
"learning_rate": 8.67430845693212e-07,
"loss": 1.4975,
"step": 6510
},
{
"epoch": 0.23901169397705194,
"grad_norm": 2.078820041783562,
"learning_rate": 8.670393489662673e-07,
"loss": 1.5147,
"step": 6520
},
{
"epoch": 0.23937827632977748,
"grad_norm": 2.313941233193865,
"learning_rate": 8.666473636444116e-07,
"loss": 1.5103,
"step": 6530
},
{
"epoch": 0.23974485868250303,
"grad_norm": 2.204068052979437,
"learning_rate": 8.662548902494473e-07,
"loss": 1.5197,
"step": 6540
},
{
"epoch": 0.24011144103522855,
"grad_norm": 2.6677538134182033,
"learning_rate": 8.658619293038265e-07,
"loss": 1.4539,
"step": 6550
},
{
"epoch": 0.2404780233879541,
"grad_norm": 2.1826711924398876,
"learning_rate": 8.654684813306508e-07,
"loss": 1.4569,
"step": 6560
},
{
"epoch": 0.24084460574067965,
"grad_norm": 2.4513733249404037,
"learning_rate": 8.650745468536691e-07,
"loss": 1.472,
"step": 6570
},
{
"epoch": 0.2412111880934052,
"grad_norm": 1.9341316559705668,
"learning_rate": 8.64680126397279e-07,
"loss": 1.5128,
"step": 6580
},
{
"epoch": 0.2415777704461307,
"grad_norm": 2.2183441842361753,
"learning_rate": 8.642852204865243e-07,
"loss": 1.5409,
"step": 6590
},
{
"epoch": 0.24194435279885626,
"grad_norm": 2.270638521627112,
"learning_rate": 8.638898296470953e-07,
"loss": 1.4992,
"step": 6600
},
{
"epoch": 0.2423109351515818,
"grad_norm": 2.6732843475957146,
"learning_rate": 8.634939544053279e-07,
"loss": 1.5335,
"step": 6610
},
{
"epoch": 0.24267751750430736,
"grad_norm": 1.9291920434342291,
"learning_rate": 8.630975952882026e-07,
"loss": 1.4627,
"step": 6620
},
{
"epoch": 0.24304409985703287,
"grad_norm": 2.05169281240212,
"learning_rate": 8.627007528233445e-07,
"loss": 1.5257,
"step": 6630
},
{
"epoch": 0.24341068220975842,
"grad_norm": 2.42497111676382,
"learning_rate": 8.623034275390214e-07,
"loss": 1.5445,
"step": 6640
},
{
"epoch": 0.24377726456248397,
"grad_norm": 2.1919485638499903,
"learning_rate": 8.619056199641444e-07,
"loss": 1.5115,
"step": 6650
},
{
"epoch": 0.2441438469152095,
"grad_norm": 2.3664261903908343,
"learning_rate": 8.615073306282663e-07,
"loss": 1.4846,
"step": 6660
},
{
"epoch": 0.24451042926793504,
"grad_norm": 2.7278440906317387,
"learning_rate": 8.611085600615812e-07,
"loss": 1.5419,
"step": 6670
},
{
"epoch": 0.24487701162066058,
"grad_norm": 2.326361941668607,
"learning_rate": 8.607093087949244e-07,
"loss": 1.5447,
"step": 6680
},
{
"epoch": 0.24524359397338613,
"grad_norm": 2.101465809666948,
"learning_rate": 8.603095773597702e-07,
"loss": 1.5147,
"step": 6690
},
{
"epoch": 0.24561017632611165,
"grad_norm": 2.121131443755951,
"learning_rate": 8.599093662882326e-07,
"loss": 1.5046,
"step": 6700
},
{
"epoch": 0.2459767586788372,
"grad_norm": 2.004374535392673,
"learning_rate": 8.595086761130641e-07,
"loss": 1.5104,
"step": 6710
},
{
"epoch": 0.24634334103156275,
"grad_norm": 2.330571487353144,
"learning_rate": 8.591075073676548e-07,
"loss": 1.489,
"step": 6720
},
{
"epoch": 0.2467099233842883,
"grad_norm": 1.954097712061658,
"learning_rate": 8.587058605860319e-07,
"loss": 1.4628,
"step": 6730
},
{
"epoch": 0.2470765057370138,
"grad_norm": 2.287871494329092,
"learning_rate": 8.583037363028591e-07,
"loss": 1.4966,
"step": 6740
},
{
"epoch": 0.24744308808973936,
"grad_norm": 2.2507921472351837,
"learning_rate": 8.579011350534355e-07,
"loss": 1.5148,
"step": 6750
},
{
"epoch": 0.2478096704424649,
"grad_norm": 2.2811051866364034,
"learning_rate": 8.574980573736951e-07,
"loss": 1.5123,
"step": 6760
},
{
"epoch": 0.24817625279519043,
"grad_norm": 2.0762345472822106,
"learning_rate": 8.570945038002066e-07,
"loss": 1.5538,
"step": 6770
},
{
"epoch": 0.24854283514791597,
"grad_norm": 2.0481616873032618,
"learning_rate": 8.566904748701718e-07,
"loss": 1.5162,
"step": 6780
},
{
"epoch": 0.24890941750064152,
"grad_norm": 1.977911548805274,
"learning_rate": 8.562859711214252e-07,
"loss": 1.4945,
"step": 6790
},
{
"epoch": 0.24927599985336707,
"grad_norm": 2.166946374211255,
"learning_rate": 8.558809930924336e-07,
"loss": 1.5143,
"step": 6800
},
{
"epoch": 0.2496425822060926,
"grad_norm": 2.265635068798512,
"learning_rate": 8.554755413222952e-07,
"loss": 1.5079,
"step": 6810
},
{
"epoch": 0.25000916455881816,
"grad_norm": 2.376856602321205,
"learning_rate": 8.550696163507384e-07,
"loss": 1.5187,
"step": 6820
},
{
"epoch": 0.2503757469115437,
"grad_norm": 2.329411952961872,
"learning_rate": 8.54663218718122e-07,
"loss": 1.4985,
"step": 6830
},
{
"epoch": 0.2507423292642692,
"grad_norm": 2.127867609490789,
"learning_rate": 8.542563489654337e-07,
"loss": 1.5249,
"step": 6840
},
{
"epoch": 0.2511089116169948,
"grad_norm": 2.3846188422530545,
"learning_rate": 8.5384900763429e-07,
"loss": 1.5157,
"step": 6850
},
{
"epoch": 0.2514754939697203,
"grad_norm": 1.9837481727043949,
"learning_rate": 8.534411952669348e-07,
"loss": 1.5185,
"step": 6860
},
{
"epoch": 0.2518420763224458,
"grad_norm": 2.0300743472877776,
"learning_rate": 8.530329124062392e-07,
"loss": 1.4726,
"step": 6870
},
{
"epoch": 0.2522086586751714,
"grad_norm": 3.41153757527899,
"learning_rate": 8.526241595957007e-07,
"loss": 1.482,
"step": 6880
},
{
"epoch": 0.2525752410278969,
"grad_norm": 2.7170854102243043,
"learning_rate": 8.52214937379442e-07,
"loss": 1.4518,
"step": 6890
},
{
"epoch": 0.25294182338062243,
"grad_norm": 2.5040883653748294,
"learning_rate": 8.518052463022112e-07,
"loss": 1.4506,
"step": 6900
},
{
"epoch": 0.253308405733348,
"grad_norm": 2.1362380301717807,
"learning_rate": 8.513950869093802e-07,
"loss": 1.4975,
"step": 6910
},
{
"epoch": 0.2536749880860735,
"grad_norm": 56.61497948468882,
"learning_rate": 8.509844597469442e-07,
"loss": 1.5211,
"step": 6920
},
{
"epoch": 0.2540415704387991,
"grad_norm": 2.161248343347086,
"learning_rate": 8.505733653615217e-07,
"loss": 1.5123,
"step": 6930
},
{
"epoch": 0.2544081527915246,
"grad_norm": 2.197831076147601,
"learning_rate": 8.501618043003522e-07,
"loss": 1.4735,
"step": 6940
},
{
"epoch": 0.25477473514425014,
"grad_norm": 2.730731478650521,
"learning_rate": 8.497497771112975e-07,
"loss": 1.5154,
"step": 6950
},
{
"epoch": 0.2551413174969757,
"grad_norm": 2.625261843658038,
"learning_rate": 8.49337284342839e-07,
"loss": 1.4642,
"step": 6960
},
{
"epoch": 0.25550789984970124,
"grad_norm": 3.6302229703502302,
"learning_rate": 8.489243265440785e-07,
"loss": 1.4339,
"step": 6970
},
{
"epoch": 0.25587448220242676,
"grad_norm": 2.2912655831406408,
"learning_rate": 8.485109042647361e-07,
"loss": 1.5021,
"step": 6980
},
{
"epoch": 0.25624106455515233,
"grad_norm": 8.005970124630041,
"learning_rate": 8.48097018055151e-07,
"loss": 1.4777,
"step": 6990
},
{
"epoch": 0.25660764690787785,
"grad_norm": 2.2515437376163097,
"learning_rate": 8.476826684662797e-07,
"loss": 1.5096,
"step": 7000
},
{
"epoch": 0.25660764690787785,
"eval_accuracy": 0.6611285662580546,
"eval_loss": 1.4870213270187378,
"eval_runtime": 310.8369,
"eval_samples_per_second": 10.639,
"eval_steps_per_second": 0.888,
"step": 7000
},
{ |
|
"epoch": 0.25697422926060337, |
|
"grad_norm": 2.531506922529387, |
|
"learning_rate": 8.472678560496955e-07, |
|
"loss": 1.4718, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 0.25734081161332895, |
|
"grad_norm": 2.6738422568666778, |
|
"learning_rate": 8.468525813575875e-07, |
|
"loss": 1.4849, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.25770739396605447, |
|
"grad_norm": 2.3045631257315256, |
|
"learning_rate": 8.464368449427608e-07, |
|
"loss": 1.3982, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 0.25807397631878004, |
|
"grad_norm": 2.3127941331475586, |
|
"learning_rate": 8.460206473586347e-07, |
|
"loss": 1.4584, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.25844055867150556, |
|
"grad_norm": 2.624025522294039, |
|
"learning_rate": 8.456039891592424e-07, |
|
"loss": 1.5064, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.2588071410242311, |
|
"grad_norm": 2.4392755048359906, |
|
"learning_rate": 8.451868708992305e-07, |
|
"loss": 1.4744, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.25917372337695666, |
|
"grad_norm": 2.244873049339989, |
|
"learning_rate": 8.447692931338577e-07, |
|
"loss": 1.4866, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 0.2595403057296822, |
|
"grad_norm": 2.7693601328533846, |
|
"learning_rate": 8.443512564189947e-07, |
|
"loss": 1.4264, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.2599068880824077, |
|
"grad_norm": 2.18123288795935, |
|
"learning_rate": 8.439327613111231e-07, |
|
"loss": 1.4487, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 0.26027347043513327, |
|
"grad_norm": 2.770780437192883, |
|
"learning_rate": 8.435138083673343e-07, |
|
"loss": 1.5298, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.2606400527878588, |
|
"grad_norm": 2.2581904540642737, |
|
"learning_rate": 8.430943981453298e-07, |
|
"loss": 1.4801, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 0.2610066351405843, |
|
"grad_norm": 2.3222299759291674, |
|
"learning_rate": 8.426745312034192e-07, |
|
"loss": 1.4896, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.2613732174933099, |
|
"grad_norm": 2.0280868196158908, |
|
"learning_rate": 8.422542081005209e-07, |
|
"loss": 1.4466, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 0.2617397998460354, |
|
"grad_norm": 2.224282133830904, |
|
"learning_rate": 8.418334293961593e-07, |
|
"loss": 1.5286, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 0.262106382198761, |
|
"grad_norm": 2.223919368251033, |
|
"learning_rate": 8.414121956504665e-07, |
|
"loss": 1.5043, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.2624729645514865, |
|
"grad_norm": 2.505467964910925, |
|
"learning_rate": 8.409905074241796e-07, |
|
"loss": 1.4781, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 0.262839546904212, |
|
"grad_norm": 2.0986445187287077, |
|
"learning_rate": 8.405683652786411e-07, |
|
"loss": 1.4804, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 0.2632061292569376, |
|
"grad_norm": 2.490412539205642, |
|
"learning_rate": 8.401457697757972e-07, |
|
"loss": 1.518, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 0.2635727116096631, |
|
"grad_norm": 2.6915376209294917, |
|
"learning_rate": 8.397227214781983e-07, |
|
"loss": 1.4812, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 0.26393929396238863, |
|
"grad_norm": 2.3046153435535235, |
|
"learning_rate": 8.392992209489973e-07, |
|
"loss": 1.5159, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.2643058763151142, |
|
"grad_norm": 2.508127660367551, |
|
"learning_rate": 8.388752687519489e-07, |
|
"loss": 1.4451, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 0.2646724586678397, |
|
"grad_norm": 3.1862145718553245, |
|
"learning_rate": 8.384508654514091e-07, |
|
"loss": 1.4609, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 0.26503904102056525, |
|
"grad_norm": 2.5580838478505803, |
|
"learning_rate": 8.380260116123343e-07, |
|
"loss": 1.4331, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 0.2654056233732908, |
|
"grad_norm": 2.257862509636175, |
|
"learning_rate": 8.376007078002813e-07, |
|
"loss": 1.45, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 0.26577220572601634, |
|
"grad_norm": 2.288080123372639, |
|
"learning_rate": 8.371749545814051e-07, |
|
"loss": 1.4389, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.2661387880787419, |
|
"grad_norm": 2.396647723381076, |
|
"learning_rate": 8.367487525224592e-07, |
|
"loss": 1.4366, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 0.26650537043146744, |
|
"grad_norm": 2.2979084143372868, |
|
"learning_rate": 8.363221021907949e-07, |
|
"loss": 1.4818, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 0.26687195278419296, |
|
"grad_norm": 2.1808515998354694, |
|
"learning_rate": 8.358950041543598e-07, |
|
"loss": 1.4542, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 0.26723853513691853, |
|
"grad_norm": 2.230268806261455, |
|
"learning_rate": 8.354674589816977e-07, |
|
"loss": 1.4329, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 0.26760511748964405, |
|
"grad_norm": 2.927648869466954, |
|
"learning_rate": 8.350394672419474e-07, |
|
"loss": 1.5225, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.26797169984236957, |
|
"grad_norm": 2.112114910370922, |
|
"learning_rate": 8.346110295048425e-07, |
|
"loss": 1.4225, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 0.26833828219509515, |
|
"grad_norm": 2.660467378126346, |
|
"learning_rate": 8.341821463407101e-07, |
|
"loss": 1.5031, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.26870486454782067, |
|
"grad_norm": 3.003354330326063, |
|
"learning_rate": 8.337528183204704e-07, |
|
"loss": 1.4707, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 0.2690714469005462, |
|
"grad_norm": 2.623779251977545, |
|
"learning_rate": 8.333230460156355e-07, |
|
"loss": 1.4794, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 0.26943802925327176, |
|
"grad_norm": 3.101895766048754, |
|
"learning_rate": 8.32892829998309e-07, |
|
"loss": 1.4667, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.2698046116059973, |
|
"grad_norm": 2.960369047027641, |
|
"learning_rate": 8.324621708411854e-07, |
|
"loss": 1.5522, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.2701711939587228, |
|
"grad_norm": 2.524100342925903, |
|
"learning_rate": 8.320310691175489e-07, |
|
"loss": 1.4526, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 0.2705377763114484, |
|
"grad_norm": 2.62363195310582, |
|
"learning_rate": 8.315995254012726e-07, |
|
"loss": 1.4018, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 0.2709043586641739, |
|
"grad_norm": 1.9920146887682115, |
|
"learning_rate": 8.311675402668188e-07, |
|
"loss": 1.3965, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 0.27127094101689947, |
|
"grad_norm": 2.18110821192289, |
|
"learning_rate": 8.307351142892364e-07, |
|
"loss": 1.4842, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.271637523369625, |
|
"grad_norm": 2.2188567896520497, |
|
"learning_rate": 8.303022480441617e-07, |
|
"loss": 1.4159, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 0.2720041057223505, |
|
"grad_norm": 2.858166839750072, |
|
"learning_rate": 8.298689421078171e-07, |
|
"loss": 1.3954, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 0.2723706880750761, |
|
"grad_norm": 2.740212521082454, |
|
"learning_rate": 8.294351970570099e-07, |
|
"loss": 1.4861, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 0.2727372704278016, |
|
"grad_norm": 3.419233012340433, |
|
"learning_rate": 8.290010134691326e-07, |
|
"loss": 1.4824, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.2731038527805271, |
|
"grad_norm": 2.4809215592986966, |
|
"learning_rate": 8.285663919221606e-07, |
|
"loss": 1.4938, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.2734704351332527, |
|
"grad_norm": 2.607478119047904, |
|
"learning_rate": 8.281313329946531e-07, |
|
"loss": 1.419, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 0.2738370174859782, |
|
"grad_norm": 2.8279213303777753, |
|
"learning_rate": 8.276958372657512e-07, |
|
"loss": 1.4801, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 0.27420359983870374, |
|
"grad_norm": 2.585541966605194, |
|
"learning_rate": 8.272599053151774e-07, |
|
"loss": 1.4154, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 0.2745701821914293, |
|
"grad_norm": 2.7236239018595336, |
|
"learning_rate": 8.268235377232351e-07, |
|
"loss": 1.4741, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 0.27493676454415483, |
|
"grad_norm": 2.2739375571211844, |
|
"learning_rate": 8.263867350708072e-07, |
|
"loss": 1.4447, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.2753033468968804, |
|
"grad_norm": 2.936703619541737, |
|
"learning_rate": 8.259494979393562e-07, |
|
"loss": 1.4811, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 0.2756699292496059, |
|
"grad_norm": 2.644051786280347, |
|
"learning_rate": 8.255118269109229e-07, |
|
"loss": 1.4359, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 0.27603651160233145, |
|
"grad_norm": 2.814370164816269, |
|
"learning_rate": 8.250737225681254e-07, |
|
"loss": 1.4697, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 0.276403093955057, |
|
"grad_norm": 2.7487477516640664, |
|
"learning_rate": 8.246351854941589e-07, |
|
"loss": 1.4677, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 0.27676967630778254, |
|
"grad_norm": 2.7840690479403807, |
|
"learning_rate": 8.241962162727946e-07, |
|
"loss": 1.462, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.27713625866050806, |
|
"grad_norm": 2.9784690105392366, |
|
"learning_rate": 8.237568154883788e-07, |
|
"loss": 1.4439, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.27750284101323364, |
|
"grad_norm": 2.8948634927350105, |
|
"learning_rate": 8.233169837258325e-07, |
|
"loss": 1.4705, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 0.27786942336595916, |
|
"grad_norm": 2.612491147603324, |
|
"learning_rate": 8.228767215706503e-07, |
|
"loss": 1.467, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 0.2782360057186847, |
|
"grad_norm": 2.8002040163179736, |
|
"learning_rate": 8.224360296088995e-07, |
|
"loss": 1.4573, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 0.27860258807141025, |
|
"grad_norm": 2.8029823959562155, |
|
"learning_rate": 8.219949084272201e-07, |
|
"loss": 1.4804, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.27896917042413577, |
|
"grad_norm": 2.6888372781846375, |
|
"learning_rate": 8.21553358612823e-07, |
|
"loss": 1.4633, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 0.27933575277686135, |
|
"grad_norm": 2.279721839418087, |
|
"learning_rate": 8.2111138075349e-07, |
|
"loss": 1.4713, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 0.27970233512958687, |
|
"grad_norm": 2.3829035564919807, |
|
"learning_rate": 8.206689754375724e-07, |
|
"loss": 1.4387, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 0.2800689174823124, |
|
"grad_norm": 3.7962407630882384, |
|
"learning_rate": 8.202261432539907e-07, |
|
"loss": 1.4025, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 0.28043549983503796, |
|
"grad_norm": 2.797043930833034, |
|
"learning_rate": 8.197828847922337e-07, |
|
"loss": 1.4576, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.2808020821877635, |
|
"grad_norm": 3.256545613051792, |
|
"learning_rate": 8.193392006423574e-07, |
|
"loss": 1.432, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 0.281168664540489, |
|
"grad_norm": 2.432668523438971, |
|
"learning_rate": 8.188950913949848e-07, |
|
"loss": 1.456, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 0.2815352468932146, |
|
"grad_norm": 2.4546993774133856, |
|
"learning_rate": 8.184505576413043e-07, |
|
"loss": 1.392, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.2819018292459401, |
|
"grad_norm": 3.0030506631971776, |
|
"learning_rate": 8.180055999730702e-07, |
|
"loss": 1.365, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 0.2822684115986656, |
|
"grad_norm": 2.9439493487762465, |
|
"learning_rate": 8.175602189826001e-07, |
|
"loss": 1.4292, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.2826349939513912, |
|
"grad_norm": 2.620909787731563, |
|
"learning_rate": 8.171144152627761e-07, |
|
"loss": 1.4251, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 0.2830015763041167, |
|
"grad_norm": 3.263683256322055, |
|
"learning_rate": 8.16668189407042e-07, |
|
"loss": 1.3899, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 0.2833681586568423, |
|
"grad_norm": 2.5437523385064953, |
|
"learning_rate": 8.162215420094045e-07, |
|
"loss": 1.3683, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 0.2837347410095678, |
|
"grad_norm": 2.4580551613838844, |
|
"learning_rate": 8.15774473664431e-07, |
|
"loss": 1.3732, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 0.2841013233622933, |
|
"grad_norm": 2.8279077970597184, |
|
"learning_rate": 8.153269849672493e-07, |
|
"loss": 1.419, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.2844679057150189, |
|
"grad_norm": 3.041958703900493, |
|
"learning_rate": 8.148790765135465e-07, |
|
"loss": 1.4356, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 0.2848344880677444, |
|
"grad_norm": 2.4582661578514426, |
|
"learning_rate": 8.144307488995689e-07, |
|
"loss": 1.4378, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 0.28520107042046994, |
|
"grad_norm": 2.8361019596271726, |
|
"learning_rate": 8.139820027221208e-07, |
|
"loss": 1.4111, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 0.2855676527731955, |
|
"grad_norm": 2.4415137770737427, |
|
"learning_rate": 8.135328385785631e-07, |
|
"loss": 1.4996, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 0.28593423512592103, |
|
"grad_norm": 2.1392002967653094, |
|
"learning_rate": 8.130832570668139e-07, |
|
"loss": 1.433, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.28630081747864655, |
|
"grad_norm": 3.061322031102369, |
|
"learning_rate": 8.126332587853462e-07, |
|
"loss": 1.4051, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 0.2866673998313721, |
|
"grad_norm": 3.2748819767509354, |
|
"learning_rate": 8.12182844333188e-07, |
|
"loss": 1.3863, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 0.28703398218409765, |
|
"grad_norm": 3.1866933217967603, |
|
"learning_rate": 8.117320143099216e-07, |
|
"loss": 1.4173, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 0.2874005645368232, |
|
"grad_norm": 2.9290211285749175, |
|
"learning_rate": 8.11280769315682e-07, |
|
"loss": 1.4395, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 0.28776714688954874, |
|
"grad_norm": 2.7212160772193474, |
|
"learning_rate": 8.108291099511571e-07, |
|
"loss": 1.4503, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.28813372924227426, |
|
"grad_norm": 2.3892746869258317, |
|
"learning_rate": 8.10377036817586e-07, |
|
"loss": 1.4368, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 0.28850031159499984, |
|
"grad_norm": 3.4107926691510277, |
|
"learning_rate": 8.099245505167589e-07, |
|
"loss": 1.4623, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 0.28886689394772536, |
|
"grad_norm": 3.1259277735027307, |
|
"learning_rate": 8.094716516510156e-07, |
|
"loss": 1.4412, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 0.2892334763004509, |
|
"grad_norm": 2.9135343767151154, |
|
"learning_rate": 8.090183408232459e-07, |
|
"loss": 1.4187, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 0.28960005865317645, |
|
"grad_norm": 3.30617041516701, |
|
"learning_rate": 8.085646186368867e-07, |
|
"loss": 1.4176, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.28996664100590197, |
|
"grad_norm": 3.1801194693312556, |
|
"learning_rate": 8.081104856959238e-07, |
|
"loss": 1.4534, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 0.2903332233586275, |
|
"grad_norm": 3.2431476470574983, |
|
"learning_rate": 8.07655942604889e-07, |
|
"loss": 1.3469, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.29069980571135307, |
|
"grad_norm": 3.1005913247685237, |
|
"learning_rate": 8.072009899688605e-07, |
|
"loss": 1.417, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 0.2910663880640786, |
|
"grad_norm": 2.953054099149132, |
|
"learning_rate": 8.067456283934614e-07, |
|
"loss": 1.4252, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 0.29143297041680416, |
|
"grad_norm": 2.6363992565855803, |
|
"learning_rate": 8.062898584848592e-07, |
|
"loss": 1.4499, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.2917995527695297, |
|
"grad_norm": 2.7290690238502635, |
|
"learning_rate": 8.05833680849765e-07, |
|
"loss": 1.4716, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 0.2921661351222552, |
|
"grad_norm": 3.21591143424738, |
|
"learning_rate": 8.053770960954328e-07, |
|
"loss": 1.3969, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 0.2925327174749808, |
|
"grad_norm": 3.8732639515812575, |
|
"learning_rate": 8.049201048296585e-07, |
|
"loss": 1.463, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 0.2928992998277063, |
|
"grad_norm": 2.9966394441630126, |
|
"learning_rate": 8.044627076607789e-07, |
|
"loss": 1.4545, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 0.2932658821804318, |
|
"grad_norm": 3.1577560282041017, |
|
"learning_rate": 8.040049051976713e-07, |
|
"loss": 1.4682, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.2932658821804318, |
|
"eval_accuracy": 0.6739903313977985, |
|
"eval_loss": 1.4271955490112305, |
|
"eval_runtime": 311.2156, |
|
"eval_samples_per_second": 10.626, |
|
"eval_steps_per_second": 0.887, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.2936324645331574, |
|
"grad_norm": 2.957786000444244, |
|
"learning_rate": 8.035466980497526e-07, |
|
"loss": 1.4592, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 0.2939990468858829, |
|
"grad_norm": 2.765279941343725, |
|
"learning_rate": 8.030880868269785e-07, |
|
"loss": 1.4404, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 0.29436562923860843, |
|
"grad_norm": 2.803405395861366, |
|
"learning_rate": 8.026290721398421e-07, |
|
"loss": 1.3642, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 0.294732211591334, |
|
"grad_norm": 3.134947642226663, |
|
"learning_rate": 8.02169654599374e-07, |
|
"loss": 1.4662, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.2950987939440595, |
|
"grad_norm": 3.3888445829207923, |
|
"learning_rate": 8.017098348171411e-07, |
|
"loss": 1.4092, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.2954653762967851, |
|
"grad_norm": 2.595961601811049, |
|
"learning_rate": 8.012496134052457e-07, |
|
"loss": 1.3772, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 0.2958319586495106, |
|
"grad_norm": 3.724884065568925, |
|
"learning_rate": 8.007889909763246e-07, |
|
"loss": 1.3862, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 0.29619854100223614, |
|
"grad_norm": 3.6608857589920754, |
|
"learning_rate": 8.003279681435482e-07, |
|
"loss": 1.444, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 0.2965651233549617, |
|
"grad_norm": 2.7154240671865213, |
|
"learning_rate": 7.998665455206206e-07, |
|
"loss": 1.4285, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 0.29693170570768723, |
|
"grad_norm": 2.7151538150939927, |
|
"learning_rate": 7.994047237217776e-07, |
|
"loss": 1.4489, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.29729828806041275, |
|
"grad_norm": 2.9729575587995742, |
|
"learning_rate": 7.989425033617863e-07, |
|
"loss": 1.4289, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 0.2976648704131383, |
|
"grad_norm": 3.298808013574498, |
|
"learning_rate": 7.984798850559447e-07, |
|
"loss": 1.4607, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 0.29803145276586385, |
|
"grad_norm": 3.1491445672684866, |
|
"learning_rate": 7.980168694200804e-07, |
|
"loss": 1.4097, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 0.29839803511858937, |
|
"grad_norm": 3.6399703354293007, |
|
"learning_rate": 7.975534570705497e-07, |
|
"loss": 1.3743, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 0.29876461747131494, |
|
"grad_norm": 3.2547493183004974, |
|
"learning_rate": 7.970896486242374e-07, |
|
"loss": 1.4346, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.29913119982404046, |
|
"grad_norm": 3.421650269839234, |
|
"learning_rate": 7.966254446985553e-07, |
|
"loss": 1.43, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 0.29949778217676604, |
|
"grad_norm": 3.797293850962011, |
|
"learning_rate": 7.961608459114416e-07, |
|
"loss": 1.4651, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 0.29986436452949156, |
|
"grad_norm": 3.5920498224364508, |
|
"learning_rate": 7.956958528813604e-07, |
|
"loss": 1.3738, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 0.3002309468822171, |
|
"grad_norm": 3.238482918382144, |
|
"learning_rate": 7.952304662273003e-07, |
|
"loss": 1.3987, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 0.30059752923494265, |
|
"grad_norm": 2.7498611423368176, |
|
"learning_rate": 7.947646865687742e-07, |
|
"loss": 1.4181, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.30096411158766817, |
|
"grad_norm": 4.031428344222072, |
|
"learning_rate": 7.942985145258179e-07, |
|
"loss": 1.4294, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 0.3013306939403937, |
|
"grad_norm": 2.643218639323195, |
|
"learning_rate": 7.938319507189894e-07, |
|
"loss": 1.4302, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 0.30169727629311927, |
|
"grad_norm": 3.1275133100531227, |
|
"learning_rate": 7.933649957693689e-07, |
|
"loss": 1.348, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 0.3020638586458448, |
|
"grad_norm": 3.521399879217592, |
|
"learning_rate": 7.928976502985565e-07, |
|
"loss": 1.4328, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 0.3024304409985703, |
|
"grad_norm": 3.1834120547065665, |
|
"learning_rate": 7.924299149286725e-07, |
|
"loss": 1.4742, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.3027970233512959, |
|
"grad_norm": 3.631213709741295, |
|
"learning_rate": 7.919617902823563e-07, |
|
"loss": 1.4068, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 0.3031636057040214, |
|
"grad_norm": 2.726938578010126, |
|
"learning_rate": 7.914932769827653e-07, |
|
"loss": 1.4359, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 0.303530188056747, |
|
"grad_norm": 3.7017959652425882, |
|
"learning_rate": 7.910243756535744e-07, |
|
"loss": 1.3344, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 0.3038967704094725, |
|
"grad_norm": 3.3417669291832066, |
|
"learning_rate": 7.90555086918975e-07, |
|
"loss": 1.4121, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 0.304263352762198, |
|
"grad_norm": 2.733351967687222, |
|
"learning_rate": 7.900854114036743e-07, |
|
"loss": 1.3732, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.3046299351149236, |
|
"grad_norm": 3.1756478835337476, |
|
"learning_rate": 7.89615349732894e-07, |
|
"loss": 1.4007, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 0.3049965174676491, |
|
"grad_norm": 3.238758242953075, |
|
"learning_rate": 7.891449025323703e-07, |
|
"loss": 1.4288, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 0.30536309982037463, |
|
"grad_norm": 2.6053607033892043, |
|
"learning_rate": 7.886740704283525e-07, |
|
"loss": 1.4156, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 0.3057296821731002, |
|
"grad_norm": 3.4053915363354417, |
|
"learning_rate": 7.88202854047602e-07, |
|
"loss": 1.3763, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 0.3060962645258257, |
|
"grad_norm": 3.715425460301463, |
|
"learning_rate": 7.877312540173922e-07, |
|
"loss": 1.4036, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.30646284687855124, |
|
"grad_norm": 2.9427971805533697, |
|
"learning_rate": 7.872592709655066e-07, |
|
"loss": 1.4385, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 0.3068294292312768, |
|
"grad_norm": 3.5845846532616426, |
|
"learning_rate": 7.867869055202392e-07, |
|
"loss": 1.415, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 0.30719601158400234, |
|
"grad_norm": 3.331222139254396, |
|
"learning_rate": 7.863141583103927e-07, |
|
"loss": 1.4126, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 0.3075625939367279, |
|
"grad_norm": 3.1984388430808406, |
|
"learning_rate": 7.85841029965278e-07, |
|
"loss": 1.3826, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 0.30792917628945343, |
|
"grad_norm": 3.1255012278404615, |
|
"learning_rate": 7.853675211147134e-07, |
|
"loss": 1.383, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.30829575864217895, |
|
"grad_norm": 3.329583698840508, |
|
"learning_rate": 7.848936323890239e-07, |
|
"loss": 1.3931, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 0.3086623409949045, |
|
"grad_norm": 3.9347250968462055, |
|
"learning_rate": 7.844193644190396e-07, |
|
"loss": 1.415, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 0.30902892334763005, |
|
"grad_norm": 4.137255951707039, |
|
"learning_rate": 7.839447178360963e-07, |
|
"loss": 1.3998, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 0.30939550570035557, |
|
"grad_norm": 2.6794621566293917, |
|
"learning_rate": 7.834696932720331e-07, |
|
"loss": 1.4228, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 0.30976208805308114, |
|
"grad_norm": 2.726588078339754, |
|
"learning_rate": 7.829942913591925e-07, |
|
"loss": 1.4486, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.31012867040580666, |
|
"grad_norm": 3.6162463016794026, |
|
"learning_rate": 7.825185127304194e-07, |
|
"loss": 1.4051, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 0.3104952527585322, |
|
"grad_norm": 2.910711368055256, |
|
"learning_rate": 7.820423580190603e-07, |
|
"loss": 1.41, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 0.31086183511125776, |
|
"grad_norm": 4.136385316326493, |
|
"learning_rate": 7.815658278589619e-07, |
|
"loss": 1.3859, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 0.3112284174639833, |
|
"grad_norm": 2.1538443576824404, |
|
"learning_rate": 7.810889228844708e-07, |
|
"loss": 1.4113, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 0.31159499981670885, |
|
"grad_norm": 3.1055419264140727, |
|
"learning_rate": 7.806116437304331e-07, |
|
"loss": 1.4327, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.31196158216943437, |
|
"grad_norm": 3.183052960747229, |
|
"learning_rate": 7.801339910321922e-07, |
|
"loss": 1.4179, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 0.3123281645221599, |
|
"grad_norm": 4.6955784323633925, |
|
"learning_rate": 7.796559654255894e-07, |
|
"loss": 1.3961, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 0.31269474687488547, |
|
"grad_norm": 3.227174794853267, |
|
"learning_rate": 7.79177567546962e-07, |
|
"loss": 1.4082, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 0.313061329227611, |
|
"grad_norm": 2.8264595214995243, |
|
"learning_rate": 7.78698798033143e-07, |
|
"loss": 1.4136, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 0.3134279115803365, |
|
"grad_norm": 3.7915043909577624, |
|
"learning_rate": 7.782196575214601e-07, |
|
"loss": 1.3758, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.3137944939330621, |
|
"grad_norm": 4.070976938559408, |
|
"learning_rate": 7.777401466497349e-07, |
|
"loss": 1.3915, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 0.3141610762857876, |
|
"grad_norm": 3.3538502722425916, |
|
"learning_rate": 7.772602660562819e-07, |
|
"loss": 1.3718, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 0.3145276586385131, |
|
"grad_norm": 3.230342363406807, |
|
"learning_rate": 7.767800163799081e-07, |
|
"loss": 1.3408, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 0.3148942409912387, |
|
"grad_norm": 3.6144160833487415, |
|
"learning_rate": 7.762993982599113e-07, |
|
"loss": 1.4296, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 0.3152608233439642, |
|
"grad_norm": 3.1182771552970374, |
|
"learning_rate": 7.758184123360803e-07, |
|
"loss": 1.3858, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.3156274056966898, |
|
"grad_norm": 3.5319206230022977, |
|
"learning_rate": 7.75337059248693e-07, |
|
"loss": 1.4342, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 0.3159939880494153, |
|
"grad_norm": 4.327639493570607, |
|
"learning_rate": 7.748553396385163e-07, |
|
"loss": 1.3915, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 0.31636057040214083, |
|
"grad_norm": 3.9982142503751326, |
|
"learning_rate": 7.743732541468053e-07, |
|
"loss": 1.363, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 0.3167271527548664, |
|
"grad_norm": 2.8786530129074728, |
|
"learning_rate": 7.738908034153015e-07, |
|
"loss": 1.3589, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.3170937351075919, |
|
"grad_norm": 4.4947342914569095, |
|
"learning_rate": 7.734079880862333e-07, |
|
"loss": 1.3506, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.31746031746031744, |
|
"grad_norm": 3.1518608629753477, |
|
"learning_rate": 7.729248088023139e-07, |
|
"loss": 1.3847, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 0.317826899813043, |
|
"grad_norm": 3.8964914548994534, |
|
"learning_rate": 7.724412662067415e-07, |
|
"loss": 1.3616, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 0.31819348216576854, |
|
"grad_norm": 4.158332163473049, |
|
"learning_rate": 7.719573609431971e-07, |
|
"loss": 1.3477, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 0.31856006451849406, |
|
"grad_norm": 5.31244346458908, |
|
"learning_rate": 7.714730936558455e-07, |
|
"loss": 1.3885, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 0.31892664687121963, |
|
"grad_norm": 3.5750048314109946, |
|
"learning_rate": 7.709884649893328e-07, |
|
"loss": 1.3763, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.31929322922394515, |
|
"grad_norm": 3.5013927398683444, |
|
"learning_rate": 7.70503475588786e-07, |
|
"loss": 1.3437, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 0.31965981157667067, |
|
"grad_norm": 3.772854937898392, |
|
"learning_rate": 7.700181260998131e-07, |
|
"loss": 1.434, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 0.32002639392939625, |
|
"grad_norm": 3.939247516045474, |
|
"learning_rate": 7.695324171685004e-07, |
|
"loss": 1.384, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 0.32039297628212177, |
|
"grad_norm": 3.3160045433400334, |
|
"learning_rate": 7.690463494414137e-07, |
|
"loss": 1.3681, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 0.32075955863484734, |
|
"grad_norm": 3.2760601494452533, |
|
"learning_rate": 7.685599235655955e-07, |
|
"loss": 1.3576, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.32112614098757286, |
|
"grad_norm": 3.917398028616676, |
|
"learning_rate": 7.680731401885658e-07, |
|
"loss": 1.4109, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.3214927233402984, |
|
"grad_norm": 4.3801775022523355, |
|
"learning_rate": 7.675859999583202e-07, |
|
"loss": 1.3688, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 0.32185930569302396, |
|
"grad_norm": 3.52546033919284, |
|
"learning_rate": 7.670985035233291e-07, |
|
"loss": 1.3803, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 0.3222258880457495, |
|
"grad_norm": 3.4568824402601925, |
|
"learning_rate": 7.666106515325374e-07, |
|
"loss": 1.3615, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 0.322592470398475, |
|
"grad_norm": 2.7983015500958826, |
|
"learning_rate": 7.661224446353634e-07, |
|
"loss": 1.3767, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.32295905275120057, |
|
"grad_norm": 3.4581919245368904, |
|
"learning_rate": 7.656338834816976e-07, |
|
"loss": 1.3768, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 0.3233256351039261, |
|
"grad_norm": 3.7176544154346054, |
|
"learning_rate": 7.651449687219018e-07, |
|
"loss": 1.3312, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 0.3236922174566516, |
|
"grad_norm": 3.6712040176600502, |
|
"learning_rate": 7.646557010068091e-07, |
|
"loss": 1.3981, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 0.3240587998093772, |
|
"grad_norm": 2.8962404949789637, |
|
"learning_rate": 7.641660809877222e-07, |
|
"loss": 1.4085, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 0.3244253821621027, |
|
"grad_norm": 5.2069626245172635, |
|
"learning_rate": 7.636761093164126e-07, |
|
"loss": 1.3489, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.3247919645148283, |
|
"grad_norm": 3.3614052591604793, |
|
"learning_rate": 7.631857866451204e-07, |
|
"loss": 1.391, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 0.3251585468675538, |
|
"grad_norm": 3.1183008582079417, |
|
"learning_rate": 7.626951136265523e-07, |
|
"loss": 1.3966, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 0.3255251292202793, |
|
"grad_norm": 4.337276600886146, |
|
"learning_rate": 7.622040909138818e-07, |
|
"loss": 1.3566, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.3258917115730049, |
|
"grad_norm": 4.083650404603487, |
|
"learning_rate": 7.617127191607479e-07, |
|
"loss": 1.3928, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 0.3262582939257304, |
|
"grad_norm": 3.847428171873619, |
|
"learning_rate": 7.612209990212543e-07, |
|
"loss": 1.3259, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.32662487627845593, |
|
"grad_norm": 3.2197146488177384, |
|
"learning_rate": 7.607289311499678e-07, |
|
"loss": 1.376, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 0.3269914586311815, |
|
"grad_norm": 3.4983962191005373, |
|
"learning_rate": 7.60236516201919e-07, |
|
"loss": 1.3927, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 0.32735804098390703, |
|
"grad_norm": 3.610610377134006, |
|
"learning_rate": 7.597437548326002e-07, |
|
"loss": 1.3792, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 0.32772462333663255, |
|
"grad_norm": 5.095826376758547, |
|
"learning_rate": 7.592506476979644e-07, |
|
"loss": 1.358, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 0.3280912056893581, |
|
"grad_norm": 3.3863305431901183, |
|
"learning_rate": 7.587571954544254e-07, |
|
"loss": 1.3983, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.32845778804208364, |
|
"grad_norm": 3.5975350890244067, |
|
"learning_rate": 7.582633987588563e-07, |
|
"loss": 1.4057, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 0.3288243703948092, |
|
"grad_norm": 3.848485096118636, |
|
"learning_rate": 7.577692582685886e-07, |
|
"loss": 1.3814, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 0.32919095274753474, |
|
"grad_norm": 3.157404479059578, |
|
"learning_rate": 7.572747746414117e-07, |
|
"loss": 1.4095, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 0.32955753510026026, |
|
"grad_norm": 4.1043127446716285, |
|
"learning_rate": 7.567799485355715e-07, |
|
"loss": 1.3755, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 0.32992411745298583, |
|
"grad_norm": 3.7156219870736615, |
|
"learning_rate": 7.562847806097696e-07, |
|
"loss": 1.3526, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.32992411745298583, |
|
"eval_accuracy": 0.688625248964108, |
|
"eval_loss": 1.3686386346817017, |
|
"eval_runtime": 311.2444, |
|
"eval_samples_per_second": 10.625, |
|
"eval_steps_per_second": 0.887, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.33029069980571135, |
|
"grad_norm": 4.016168592808031, |
|
"learning_rate": 7.557892715231634e-07, |
|
"loss": 1.3607, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 0.33065728215843687, |
|
"grad_norm": 3.504820069720998, |
|
"learning_rate": 7.552934219353638e-07, |
|
"loss": 1.3833, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 0.33102386451116245, |
|
"grad_norm": 3.3563895186210875, |
|
"learning_rate": 7.547972325064351e-07, |
|
"loss": 1.393, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 0.33139044686388797, |
|
"grad_norm": 3.401944814988902, |
|
"learning_rate": 7.543007038968939e-07, |
|
"loss": 1.3708, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 0.3317570292166135, |
|
"grad_norm": 4.8917426491539935, |
|
"learning_rate": 7.538038367677087e-07, |
|
"loss": 1.329, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.33212361156933906, |
|
"grad_norm": 4.014824315681244, |
|
"learning_rate": 7.53306631780298e-07, |
|
"loss": 1.3464, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 0.3324901939220646, |
|
"grad_norm": 3.9395593086417637, |
|
"learning_rate": 7.52809089596531e-07, |
|
"loss": 1.4059, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 0.33285677627479016, |
|
"grad_norm": 3.5141323515233274, |
|
"learning_rate": 7.523112108787247e-07, |
|
"loss": 1.3467, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 0.3332233586275157, |
|
"grad_norm": 4.310837199551292, |
|
"learning_rate": 7.518129962896448e-07, |
|
"loss": 1.3432, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 0.3335899409802412, |
|
"grad_norm": 4.049279934012434, |
|
"learning_rate": 7.513144464925036e-07, |
|
"loss": 1.4107, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.33395652333296677, |
|
"grad_norm": 5.43599736913238, |
|
"learning_rate": 7.508155621509603e-07, |
|
"loss": 1.3779, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 0.3343231056856923, |
|
"grad_norm": 4.312594101718665, |
|
"learning_rate": 7.503163439291187e-07, |
|
"loss": 1.3279, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 0.3346896880384178, |
|
"grad_norm": 3.7888042986131794, |
|
"learning_rate": 7.498167924915276e-07, |
|
"loss": 1.3422, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 0.3350562703911434, |
|
"grad_norm": 4.6227274755808665, |
|
"learning_rate": 7.493169085031791e-07, |
|
"loss": 1.3489, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 0.3354228527438689, |
|
"grad_norm": 4.440746888404653, |
|
"learning_rate": 7.48816692629508e-07, |
|
"loss": 1.3955, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.3357894350965944, |
|
"grad_norm": 3.1422454499623753, |
|
"learning_rate": 7.483161455363909e-07, |
|
"loss": 1.3613, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 0.33615601744932, |
|
"grad_norm": 3.894653506327936, |
|
"learning_rate": 7.478152678901455e-07, |
|
"loss": 1.4148, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 0.3365225998020455, |
|
"grad_norm": 5.433033949859381, |
|
"learning_rate": 7.473140603575294e-07, |
|
"loss": 1.3144, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 0.3368891821547711, |
|
"grad_norm": 3.975951714183405, |
|
"learning_rate": 7.468125236057392e-07, |
|
"loss": 1.3691, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 0.3372557645074966, |
|
"grad_norm": 4.918343199781564, |
|
"learning_rate": 7.463106583024099e-07, |
|
"loss": 1.3848, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.33762234686022213, |
|
"grad_norm": 4.865872631877682, |
|
"learning_rate": 7.458084651156138e-07, |
|
"loss": 1.3612, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 0.3379889292129477, |
|
"grad_norm": 4.124355883120795, |
|
"learning_rate": 7.453059447138597e-07, |
|
"loss": 1.3922, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 0.33835551156567323, |
|
"grad_norm": 3.4927433175723968, |
|
"learning_rate": 7.448030977660921e-07, |
|
"loss": 1.3209, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 0.33872209391839875, |
|
"grad_norm": 3.5565740075352887, |
|
"learning_rate": 7.4429992494169e-07, |
|
"loss": 1.3137, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 0.3390886762711243, |
|
"grad_norm": 3.2292820179583335, |
|
"learning_rate": 7.437964269104663e-07, |
|
"loss": 1.3469, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.33945525862384984, |
|
"grad_norm": 5.260253752526274, |
|
"learning_rate": 7.432926043426668e-07, |
|
"loss": 1.3067, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 0.33982184097657536, |
|
"grad_norm": 4.394976349303848, |
|
"learning_rate": 7.427884579089691e-07, |
|
"loss": 1.3423, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 0.34018842332930094, |
|
"grad_norm": 3.396422180187779, |
|
"learning_rate": 7.422839882804825e-07, |
|
"loss": 1.3449, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 0.34055500568202646, |
|
"grad_norm": 4.387777704799267, |
|
"learning_rate": 7.417791961287457e-07, |
|
"loss": 1.3274, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 0.34092158803475203, |
|
"grad_norm": 4.664699242153168, |
|
"learning_rate": 7.412740821257275e-07, |
|
"loss": 1.3147, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.34128817038747755, |
|
"grad_norm": 3.393736360787831, |
|
"learning_rate": 7.407686469438248e-07, |
|
"loss": 1.3934, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 0.34165475274020307, |
|
"grad_norm": 4.750927708757991, |
|
"learning_rate": 7.40262891255862e-07, |
|
"loss": 1.4067, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 0.34202133509292865, |
|
"grad_norm": 3.428169411059033, |
|
"learning_rate": 7.397568157350903e-07, |
|
"loss": 1.3411, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 0.34238791744565417, |
|
"grad_norm": 4.302469394811799, |
|
"learning_rate": 7.392504210551865e-07, |
|
"loss": 1.299, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 0.3427544997983797, |
|
"grad_norm": 7.00981557963566, |
|
"learning_rate": 7.387437078902523e-07, |
|
"loss": 1.3573, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.34312108215110526, |
|
"grad_norm": 5.566063359486336, |
|
"learning_rate": 7.382366769148136e-07, |
|
"loss": 1.3497, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 0.3434876645038308, |
|
"grad_norm": 3.4660448886166244, |
|
"learning_rate": 7.37729328803819e-07, |
|
"loss": 1.4092, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 0.3438542468565563, |
|
"grad_norm": 3.702869545438875, |
|
"learning_rate": 7.372216642326394e-07, |
|
"loss": 1.3603, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 0.3442208292092819, |
|
"grad_norm": 4.231146103126798, |
|
"learning_rate": 7.367136838770671e-07, |
|
"loss": 1.3428, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 0.3445874115620074, |
|
"grad_norm": 4.554271919619236, |
|
"learning_rate": 7.362053884133146e-07, |
|
"loss": 1.3311, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.34495399391473297, |
|
"grad_norm": 4.041325390537124, |
|
"learning_rate": 7.35696778518014e-07, |
|
"loss": 1.3471, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 0.3453205762674585, |
|
"grad_norm": 5.283681695413367, |
|
"learning_rate": 7.351878548682155e-07, |
|
"loss": 1.3334, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 0.345687158620184, |
|
"grad_norm": 4.104429136831335, |
|
"learning_rate": 7.34678618141388e-07, |
|
"loss": 1.3443, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 0.3460537409729096, |
|
"grad_norm": 4.637839526253117, |
|
"learning_rate": 7.341690690154161e-07, |
|
"loss": 1.3383, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 0.3464203233256351, |
|
"grad_norm": 6.447434633082354, |
|
"learning_rate": 7.336592081686007e-07, |
|
"loss": 1.3769, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.3467869056783606, |
|
"grad_norm": 4.989354934531907, |
|
"learning_rate": 7.331490362796579e-07, |
|
"loss": 1.3651, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 0.3471534880310862, |
|
"grad_norm": 4.121285832330203, |
|
"learning_rate": 7.326385540277171e-07, |
|
"loss": 1.319, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 0.3475200703838117, |
|
"grad_norm": 3.7909593948348284, |
|
"learning_rate": 7.321277620923217e-07, |
|
"loss": 1.3743, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 0.34788665273653724, |
|
"grad_norm": 3.3733089497346853, |
|
"learning_rate": 7.316166611534267e-07, |
|
"loss": 1.3743, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 0.3482532350892628, |
|
"grad_norm": 3.7253741770570823, |
|
"learning_rate": 7.311052518913989e-07, |
|
"loss": 1.2903, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.34861981744198833, |
|
"grad_norm": 4.039793671210928, |
|
"learning_rate": 7.305935349870155e-07, |
|
"loss": 1.2862, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 0.3489863997947139, |
|
"grad_norm": 4.342535349346429, |
|
"learning_rate": 7.300815111214628e-07, |
|
"loss": 1.3808, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 0.34935298214743943, |
|
"grad_norm": 5.42799281760455, |
|
"learning_rate": 7.29569180976336e-07, |
|
"loss": 1.3523, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 0.34971956450016495, |
|
"grad_norm": 5.020277916958928, |
|
"learning_rate": 7.290565452336381e-07, |
|
"loss": 1.3256, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 0.3500861468528905, |
|
"grad_norm": 4.373712918374428, |
|
"learning_rate": 7.285436045757789e-07, |
|
"loss": 1.2827, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.35045272920561604, |
|
"grad_norm": 6.179796353095443, |
|
"learning_rate": 7.280303596855737e-07, |
|
"loss": 1.3197, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 0.35081931155834156, |
|
"grad_norm": 5.167300912494304, |
|
"learning_rate": 7.275168112462433e-07, |
|
"loss": 1.331, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 0.35118589391106714, |
|
"grad_norm": 4.118700000532668, |
|
"learning_rate": 7.270029599414125e-07, |
|
"loss": 1.3529, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 0.35155247626379266, |
|
"grad_norm": 3.6038833094843516, |
|
"learning_rate": 7.264888064551089e-07, |
|
"loss": 1.3258, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 0.3519190586165182, |
|
"grad_norm": 3.5142758374979524, |
|
"learning_rate": 7.259743514717627e-07, |
|
"loss": 1.3377, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.35228564096924375, |
|
"grad_norm": 4.1250041287694685, |
|
"learning_rate": 7.254595956762053e-07, |
|
"loss": 1.3135, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 0.35265222332196927, |
|
"grad_norm": 3.132058137932181, |
|
"learning_rate": 7.249445397536686e-07, |
|
"loss": 1.3349, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 0.35301880567469485, |
|
"grad_norm": 3.399519224329254, |
|
"learning_rate": 7.244291843897839e-07, |
|
"loss": 1.3052, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 0.35338538802742037, |
|
"grad_norm": 4.712619284275666, |
|
"learning_rate": 7.239135302705816e-07, |
|
"loss": 1.3065, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 0.3537519703801459, |
|
"grad_norm": 3.734161433235809, |
|
"learning_rate": 7.23397578082489e-07, |
|
"loss": 1.3094, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.35411855273287146, |
|
"grad_norm": 5.100823292959423, |
|
"learning_rate": 7.228813285123308e-07, |
|
"loss": 1.3331, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 0.354485135085597, |
|
"grad_norm": 4.534677424827633, |
|
"learning_rate": 7.223647822473271e-07, |
|
"loss": 1.3912, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 0.3548517174383225, |
|
"grad_norm": 3.470979394380451, |
|
"learning_rate": 7.218479399750934e-07, |
|
"loss": 1.3476, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 0.3552182997910481, |
|
"grad_norm": 4.753775104454421, |
|
"learning_rate": 7.21330802383639e-07, |
|
"loss": 1.3167, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 0.3555848821437736, |
|
"grad_norm": 3.412263014571041, |
|
"learning_rate": 7.208133701613665e-07, |
|
"loss": 1.3358, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.3559514644964991, |
|
"grad_norm": 4.131601355517602, |
|
"learning_rate": 7.202956439970704e-07, |
|
"loss": 1.3244, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 0.3563180468492247, |
|
"grad_norm": 5.122163472630932, |
|
"learning_rate": 7.197776245799367e-07, |
|
"loss": 1.2796, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 0.3566846292019502, |
|
"grad_norm": 5.335391466451254, |
|
"learning_rate": 7.192593125995418e-07, |
|
"loss": 1.3161, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 0.3570512115546758, |
|
"grad_norm": 4.103339016303858, |
|
"learning_rate": 7.187407087458518e-07, |
|
"loss": 1.4146, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 0.3574177939074013, |
|
"grad_norm": 5.904708913785668, |
|
"learning_rate": 7.182218137092204e-07, |
|
"loss": 1.3092, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.3577843762601268, |
|
"grad_norm": 4.187532290173183, |
|
"learning_rate": 7.1770262818039e-07, |
|
"loss": 1.2946, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 0.3581509586128524, |
|
"grad_norm": 4.6467762537942, |
|
"learning_rate": 7.17183152850489e-07, |
|
"loss": 1.3212, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 0.3585175409655779, |
|
"grad_norm": 4.424491675585427, |
|
"learning_rate": 7.16663388411032e-07, |
|
"loss": 1.3167, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 0.35888412331830344, |
|
"grad_norm": 4.460602913760459, |
|
"learning_rate": 7.161433355539181e-07, |
|
"loss": 1.3514, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 0.359250705671029, |
|
"grad_norm": 7.380392542181771, |
|
"learning_rate": 7.156229949714307e-07, |
|
"loss": 1.305, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.35961728802375453, |
|
"grad_norm": 3.677155226574757, |
|
"learning_rate": 7.15102367356236e-07, |
|
"loss": 1.3175, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 0.35998387037648005, |
|
"grad_norm": 2.995203775176967, |
|
"learning_rate": 7.145814534013821e-07, |
|
"loss": 1.3833, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 0.36035045272920563, |
|
"grad_norm": 3.5086546677463364, |
|
"learning_rate": 7.140602538002989e-07, |
|
"loss": 1.3858, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 0.36071703508193115, |
|
"grad_norm": 3.523795917156669, |
|
"learning_rate": 7.135387692467957e-07, |
|
"loss": 1.3375, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 0.3610836174346567, |
|
"grad_norm": 3.7313877963514, |
|
"learning_rate": 7.130170004350617e-07, |
|
"loss": 1.3094, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.36145019978738224, |
|
"grad_norm": 4.442532041857861, |
|
"learning_rate": 7.124949480596644e-07, |
|
"loss": 1.3121, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 0.36181678214010776, |
|
"grad_norm": 5.641090705197642, |
|
"learning_rate": 7.119726128155487e-07, |
|
"loss": 1.3387, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 0.36218336449283334, |
|
"grad_norm": 9.369536303911914, |
|
"learning_rate": 7.114499953980362e-07, |
|
"loss": 1.3413, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 0.36254994684555886, |
|
"grad_norm": 4.32109030408511, |
|
"learning_rate": 7.109270965028238e-07, |
|
"loss": 1.3636, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 0.3629165291982844, |
|
"grad_norm": 6.871086039775216, |
|
"learning_rate": 7.104039168259834e-07, |
|
"loss": 1.352, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.36328311155100995, |
|
"grad_norm": 4.509944406939018, |
|
"learning_rate": 7.098804570639605e-07, |
|
"loss": 1.2874, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 0.36364969390373547, |
|
"grad_norm": 4.612863347134658, |
|
"learning_rate": 7.093567179135738e-07, |
|
"loss": 1.2676, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 0.364016276256461, |
|
"grad_norm": 4.091094769005595, |
|
"learning_rate": 7.088327000720131e-07, |
|
"loss": 1.3038, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 0.36438285860918657, |
|
"grad_norm": 4.977334963231582, |
|
"learning_rate": 7.083084042368401e-07, |
|
"loss": 1.3008, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 0.3647494409619121, |
|
"grad_norm": 5.166826475680081, |
|
"learning_rate": 7.077838311059862e-07, |
|
"loss": 1.2881, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.36511602331463766, |
|
"grad_norm": 4.01832965003142, |
|
"learning_rate": 7.072589813777518e-07, |
|
"loss": 1.3523, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 0.3654826056673632, |
|
"grad_norm": 3.8045628665321214, |
|
"learning_rate": 7.067338557508055e-07, |
|
"loss": 1.3155, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 0.3658491880200887, |
|
"grad_norm": 4.344284713227578, |
|
"learning_rate": 7.062084549241833e-07, |
|
"loss": 1.3314, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 0.3662157703728143, |
|
"grad_norm": 4.559382806632024, |
|
"learning_rate": 7.056827795972876e-07, |
|
"loss": 1.3242, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 0.3665823527255398, |
|
"grad_norm": 8.960735940046002, |
|
"learning_rate": 7.051568304698862e-07, |
|
"loss": 1.2563, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.3665823527255398, |
|
"eval_accuracy": 0.7009188125309459, |
|
"eval_loss": 1.3158118724822998, |
|
"eval_runtime": 311.2198, |
|
"eval_samples_per_second": 10.626, |
|
"eval_steps_per_second": 0.887, |
|
"step": 10000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 27279, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1085213557587968.0, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|