{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 24236, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00412609341475491, "grad_norm": 14.695940971374512, "learning_rate": 1e-05, "loss": 8.626, "step": 100 }, { "epoch": 0.00825218682950982, "grad_norm": 19.695737838745117, "learning_rate": 2e-05, "loss": 7.5714, "step": 200 }, { "epoch": 0.01237828024426473, "grad_norm": 7.499746799468994, "learning_rate": 3e-05, "loss": 6.9595, "step": 300 }, { "epoch": 0.01650437365901964, "grad_norm": 22.69700813293457, "learning_rate": 4e-05, "loss": 5.6622, "step": 400 }, { "epoch": 0.02063046707377455, "grad_norm": 16.763093948364258, "learning_rate": 5e-05, "loss": 1.9119, "step": 500 }, { "epoch": 0.02475656048852946, "grad_norm": 3.74697208404541, "learning_rate": 4.978934951129087e-05, "loss": 1.6244, "step": 600 }, { "epoch": 0.02888265390328437, "grad_norm": 6.804798126220703, "learning_rate": 4.957869902258174e-05, "loss": 1.4867, "step": 700 }, { "epoch": 0.03300874731803928, "grad_norm": 4.382748603820801, "learning_rate": 4.93680485338726e-05, "loss": 1.4434, "step": 800 }, { "epoch": 0.03713484073279419, "grad_norm": 2.020482063293457, "learning_rate": 4.9157398045163464e-05, "loss": 1.3701, "step": 900 }, { "epoch": 0.0412609341475491, "grad_norm": 1.9225651025772095, "learning_rate": 4.894674755645433e-05, "loss": 1.383, "step": 1000 }, { "epoch": 0.04538702756230401, "grad_norm": 1.8698792457580566, "learning_rate": 4.87360970677452e-05, "loss": 1.3512, "step": 1100 }, { "epoch": 0.04951312097705892, "grad_norm": 4.490991592407227, "learning_rate": 4.852544657903607e-05, "loss": 1.3039, "step": 1200 }, { "epoch": 0.05363921439181383, "grad_norm": 2.697434186935425, "learning_rate": 4.831479609032693e-05, "loss": 1.3085, "step": 1300 }, { "epoch": 0.05776530780656874, "grad_norm": 3.3568286895751953, "learning_rate": 4.81041456016178e-05, "loss": 1.2915, "step": 1400 }, { "epoch": 0.06189140122132365, "grad_norm": 2.012889862060547, "learning_rate": 4.789349511290866e-05, "loss": 1.2778, "step": 1500 }, { "epoch": 0.06601749463607856, "grad_norm": 4.024045467376709, "learning_rate": 4.768284462419953e-05, "loss": 1.2688, "step": 1600 }, { "epoch": 0.07014358805083347, "grad_norm": 2.241870880126953, "learning_rate": 4.7472194135490394e-05, "loss": 1.2592, "step": 1700 }, { "epoch": 0.07426968146558838, "grad_norm": 3.7178213596343994, "learning_rate": 4.7261543646781266e-05, "loss": 1.2112, "step": 1800 }, { "epoch": 0.07839577488034329, "grad_norm": 2.036505937576294, "learning_rate": 4.705089315807213e-05, "loss": 1.2126, "step": 1900 }, { "epoch": 0.0825218682950982, "grad_norm": 1.7717580795288086, "learning_rate": 4.6840242669362997e-05, "loss": 1.191, "step": 2000 }, { "epoch": 0.08664796170985312, "grad_norm": 1.8078298568725586, "learning_rate": 4.662959218065386e-05, "loss": 1.199, "step": 2100 }, { "epoch": 0.09077405512460802, "grad_norm": 4.067634582519531, "learning_rate": 4.641894169194473e-05, "loss": 1.1415, "step": 2200 }, { "epoch": 0.09490014853936293, "grad_norm": 1.9144686460494995, "learning_rate": 4.620829120323559e-05, "loss": 1.1486, "step": 2300 }, { "epoch": 0.09902624195411784, "grad_norm": 1.4644508361816406, "learning_rate": 4.599764071452646e-05, "loss": 1.1665, "step": 2400 }, { "epoch": 0.10315233536887275, "grad_norm": 1.6876461505889893, "learning_rate": 4.578699022581733e-05, "loss": 1.1197, "step": 2500 }, { "epoch": 0.10727842878362766, "grad_norm": 1.6274546384811401, "learning_rate": 4.5576339737108196e-05, "loss": 1.1505, "step": 2600 }, { "epoch": 0.11140452219838257, "grad_norm": 2.2767255306243896, "learning_rate": 4.5365689248399054e-05, "loss": 1.1376, "step": 2700 }, { "epoch": 0.11553061561313747, "grad_norm": 2.0346932411193848, "learning_rate": 4.5155038759689926e-05, "loss": 1.0817, "step": 2800 }, { "epoch": 0.1196567090278924, "grad_norm": 1.5901857614517212, "learning_rate": 4.494438827098079e-05, "loss": 1.0919, "step": 2900 }, { "epoch": 0.1237828024426473, "grad_norm": 2.1194183826446533, "learning_rate": 4.473373778227166e-05, "loss": 1.0813, "step": 3000 }, { "epoch": 0.1279088958574022, "grad_norm": 4.601478576660156, "learning_rate": 4.452308729356252e-05, "loss": 1.1204, "step": 3100 }, { "epoch": 0.13203498927215712, "grad_norm": 2.3545944690704346, "learning_rate": 4.431243680485339e-05, "loss": 1.0488, "step": 3200 }, { "epoch": 0.13616108268691204, "grad_norm": 1.468526005744934, "learning_rate": 4.410178631614425e-05, "loss": 1.1108, "step": 3300 }, { "epoch": 0.14028717610166694, "grad_norm": 1.5844910144805908, "learning_rate": 4.389113582743512e-05, "loss": 1.1114, "step": 3400 }, { "epoch": 0.14441326951642186, "grad_norm": 4.228846073150635, "learning_rate": 4.368048533872599e-05, "loss": 1.0961, "step": 3500 }, { "epoch": 0.14853936293117675, "grad_norm": 1.4088937044143677, "learning_rate": 4.3469834850016856e-05, "loss": 1.0758, "step": 3600 }, { "epoch": 0.15266545634593168, "grad_norm": 1.6613353490829468, "learning_rate": 4.325918436130772e-05, "loss": 1.0916, "step": 3700 }, { "epoch": 0.15679154976068657, "grad_norm": 1.6648322343826294, "learning_rate": 4.304853387259859e-05, "loss": 1.0784, "step": 3800 }, { "epoch": 0.1609176431754415, "grad_norm": 1.387666940689087, "learning_rate": 4.283788338388945e-05, "loss": 1.0894, "step": 3900 }, { "epoch": 0.1650437365901964, "grad_norm": 1.416618824005127, "learning_rate": 4.262723289518032e-05, "loss": 1.0283, "step": 4000 }, { "epoch": 0.1691698300049513, "grad_norm": 1.569306492805481, "learning_rate": 4.241658240647118e-05, "loss": 1.0513, "step": 4100 }, { "epoch": 0.17329592341970623, "grad_norm": 1.4747782945632935, "learning_rate": 4.2205931917762055e-05, "loss": 1.0531, "step": 4200 }, { "epoch": 0.17742201683446113, "grad_norm": 2.1925017833709717, "learning_rate": 4.199528142905292e-05, "loss": 1.058, "step": 4300 }, { "epoch": 0.18154811024921605, "grad_norm": 1.9729565382003784, "learning_rate": 4.1784630940343786e-05, "loss": 1.0806, "step": 4400 }, { "epoch": 0.18567420366397094, "grad_norm": 1.5546541213989258, "learning_rate": 4.1573980451634644e-05, "loss": 1.0746, "step": 4500 }, { "epoch": 0.18980029707872587, "grad_norm": 1.8433148860931396, "learning_rate": 4.1363329962925517e-05, "loss": 1.0527, "step": 4600 }, { "epoch": 0.19392639049348076, "grad_norm": 1.3673489093780518, "learning_rate": 4.115267947421638e-05, "loss": 1.0772, "step": 4700 }, { "epoch": 0.19805248390823568, "grad_norm": 1.8290094137191772, "learning_rate": 4.094202898550725e-05, "loss": 1.0424, "step": 4800 }, { "epoch": 0.2021785773229906, "grad_norm": 2.0811445713043213, "learning_rate": 4.073137849679812e-05, "loss": 1.0498, "step": 4900 }, { "epoch": 0.2063046707377455, "grad_norm": 1.2849047183990479, "learning_rate": 4.0520728008088985e-05, "loss": 1.0341, "step": 5000 }, { "epoch": 0.21043076415250042, "grad_norm": 3.288480281829834, "learning_rate": 4.0310077519379843e-05, "loss": 1.0273, "step": 5100 }, { "epoch": 0.21455685756725532, "grad_norm": 1.2297766208648682, "learning_rate": 4.009942703067071e-05, "loss": 1.0768, "step": 5200 }, { "epoch": 0.21868295098201024, "grad_norm": 1.4169477224349976, "learning_rate": 3.988877654196158e-05, "loss": 1.0334, "step": 5300 }, { "epoch": 0.22280904439676513, "grad_norm": 1.314010739326477, "learning_rate": 3.9678126053252446e-05, "loss": 1.0208, "step": 5400 }, { "epoch": 0.22693513781152005, "grad_norm": 1.5176063776016235, "learning_rate": 3.946747556454331e-05, "loss": 1.0561, "step": 5500 }, { "epoch": 0.23106123122627495, "grad_norm": 1.418303370475769, "learning_rate": 3.9256825075834184e-05, "loss": 1.0223, "step": 5600 }, { "epoch": 0.23518732464102987, "grad_norm": 1.8820278644561768, "learning_rate": 3.904617458712504e-05, "loss": 1.0333, "step": 5700 }, { "epoch": 0.2393134180557848, "grad_norm": 1.7610660791397095, "learning_rate": 3.883552409841591e-05, "loss": 1.007, "step": 5800 }, { "epoch": 0.2434395114705397, "grad_norm": 7.877830982208252, "learning_rate": 3.862487360970677e-05, "loss": 1.0138, "step": 5900 }, { "epoch": 0.2475656048852946, "grad_norm": 1.4319870471954346, "learning_rate": 3.8414223120997645e-05, "loss": 1.0103, "step": 6000 }, { "epoch": 0.2516916983000495, "grad_norm": 1.4879227876663208, "learning_rate": 3.820357263228851e-05, "loss": 1.0219, "step": 6100 }, { "epoch": 0.2558177917148044, "grad_norm": 1.3280787467956543, "learning_rate": 3.7992922143579376e-05, "loss": 1.0157, "step": 6200 }, { "epoch": 0.25994388512955935, "grad_norm": 2.4915549755096436, "learning_rate": 3.778227165487024e-05, "loss": 1.0422, "step": 6300 }, { "epoch": 0.26406997854431424, "grad_norm": 1.3016897439956665, "learning_rate": 3.757162116616111e-05, "loss": 1.004, "step": 6400 }, { "epoch": 0.26819607195906914, "grad_norm": 1.722939372062683, "learning_rate": 3.736097067745197e-05, "loss": 1.0196, "step": 6500 }, { "epoch": 0.2723221653738241, "grad_norm": 1.4764331579208374, "learning_rate": 3.715032018874284e-05, "loss": 0.9871, "step": 6600 }, { "epoch": 0.276448258788579, "grad_norm": 1.344777226448059, "learning_rate": 3.693966970003371e-05, "loss": 1.0249, "step": 6700 }, { "epoch": 0.2805743522033339, "grad_norm": 1.1756465435028076, "learning_rate": 3.6729019211324575e-05, "loss": 1.0506, "step": 6800 }, { "epoch": 0.28470044561808877, "grad_norm": 1.3845124244689941, "learning_rate": 3.6518368722615434e-05, "loss": 1.0041, "step": 6900 }, { "epoch": 0.2888265390328437, "grad_norm": 1.074078917503357, "learning_rate": 3.6307718233906306e-05, "loss": 0.9849, "step": 7000 }, { "epoch": 0.2929526324475986, "grad_norm": 2.0719516277313232, "learning_rate": 3.609706774519717e-05, "loss": 1.0022, "step": 7100 }, { "epoch": 0.2970787258623535, "grad_norm": 1.1381429433822632, "learning_rate": 3.5886417256488037e-05, "loss": 0.9409, "step": 7200 }, { "epoch": 0.30120481927710846, "grad_norm": 1.2426626682281494, "learning_rate": 3.56757667677789e-05, "loss": 1.0225, "step": 7300 }, { "epoch": 0.30533091269186335, "grad_norm": 2.445568561553955, "learning_rate": 3.5465116279069774e-05, "loss": 0.9725, "step": 7400 }, { "epoch": 0.30945700610661825, "grad_norm": 1.2126537561416626, "learning_rate": 3.525446579036063e-05, "loss": 1.0005, "step": 7500 }, { "epoch": 0.31358309952137314, "grad_norm": 2.634969472885132, "learning_rate": 3.50438153016515e-05, "loss": 1.0079, "step": 7600 }, { "epoch": 0.3177091929361281, "grad_norm": 1.4859946966171265, "learning_rate": 3.483316481294237e-05, "loss": 1.0192, "step": 7700 }, { "epoch": 0.321835286350883, "grad_norm": 1.3265373706817627, "learning_rate": 3.4622514324233236e-05, "loss": 0.9836, "step": 7800 }, { "epoch": 0.3259613797656379, "grad_norm": 1.569514513015747, "learning_rate": 3.44118638355241e-05, "loss": 1.002, "step": 7900 }, { "epoch": 0.3300874731803928, "grad_norm": 1.718145728111267, "learning_rate": 3.4201213346814966e-05, "loss": 0.9599, "step": 8000 }, { "epoch": 0.3342135665951477, "grad_norm": 1.2960829734802246, "learning_rate": 3.399056285810583e-05, "loss": 1.0286, "step": 8100 }, { "epoch": 0.3383396600099026, "grad_norm": 1.3030658960342407, "learning_rate": 3.37799123693967e-05, "loss": 0.9592, "step": 8200 }, { "epoch": 0.3424657534246575, "grad_norm": 1.6679294109344482, "learning_rate": 3.356926188068756e-05, "loss": 0.9823, "step": 8300 }, { "epoch": 0.34659184683941247, "grad_norm": 1.079559326171875, "learning_rate": 3.335861139197843e-05, "loss": 0.9749, "step": 8400 }, { "epoch": 0.35071794025416736, "grad_norm": 1.27901029586792, "learning_rate": 3.31479609032693e-05, "loss": 0.9801, "step": 8500 }, { "epoch": 0.35484403366892225, "grad_norm": 1.292656421661377, "learning_rate": 3.2937310414560165e-05, "loss": 0.9824, "step": 8600 }, { "epoch": 0.35897012708367715, "grad_norm": 1.2524762153625488, "learning_rate": 3.272665992585103e-05, "loss": 0.943, "step": 8700 }, { "epoch": 0.3630962204984321, "grad_norm": 2.4386353492736816, "learning_rate": 3.2516009437141896e-05, "loss": 0.9738, "step": 8800 }, { "epoch": 0.367222313913187, "grad_norm": 1.2332638502120972, "learning_rate": 3.230535894843276e-05, "loss": 0.9599, "step": 8900 }, { "epoch": 0.3713484073279419, "grad_norm": 1.3955186605453491, "learning_rate": 3.209470845972363e-05, "loss": 1.0027, "step": 9000 }, { "epoch": 0.37547450074269684, "grad_norm": 1.7736716270446777, "learning_rate": 3.188405797101449e-05, "loss": 0.9568, "step": 9100 }, { "epoch": 0.37960059415745173, "grad_norm": 1.1282614469528198, "learning_rate": 3.1673407482305364e-05, "loss": 0.9892, "step": 9200 }, { "epoch": 0.3837266875722066, "grad_norm": 4.226625442504883, "learning_rate": 3.146275699359622e-05, "loss": 0.957, "step": 9300 }, { "epoch": 0.3878527809869615, "grad_norm": 1.3062007427215576, "learning_rate": 3.125210650488709e-05, "loss": 0.9702, "step": 9400 }, { "epoch": 0.39197887440171647, "grad_norm": 1.5109843015670776, "learning_rate": 3.104145601617796e-05, "loss": 0.9406, "step": 9500 }, { "epoch": 0.39610496781647136, "grad_norm": 1.2154899835586548, "learning_rate": 3.0830805527468826e-05, "loss": 0.9532, "step": 9600 }, { "epoch": 0.40023106123122626, "grad_norm": 1.239396095275879, "learning_rate": 3.062015503875969e-05, "loss": 0.9484, "step": 9700 }, { "epoch": 0.4043571546459812, "grad_norm": 1.3215525150299072, "learning_rate": 3.0409504550050553e-05, "loss": 0.978, "step": 9800 }, { "epoch": 0.4084832480607361, "grad_norm": 1.149057149887085, "learning_rate": 3.0198854061341425e-05, "loss": 0.968, "step": 9900 }, { "epoch": 0.412609341475491, "grad_norm": 1.271074652671814, "learning_rate": 2.998820357263229e-05, "loss": 0.9794, "step": 10000 }, { "epoch": 0.4167354348902459, "grad_norm": 1.0992262363433838, "learning_rate": 2.9777553083923153e-05, "loss": 0.9594, "step": 10100 }, { "epoch": 0.42086152830500084, "grad_norm": 1.1205365657806396, "learning_rate": 2.9566902595214025e-05, "loss": 0.9439, "step": 10200 }, { "epoch": 0.42498762171975574, "grad_norm": 1.144080638885498, "learning_rate": 2.935625210650489e-05, "loss": 0.9745, "step": 10300 }, { "epoch": 0.42911371513451063, "grad_norm": 3.2051868438720703, "learning_rate": 2.9145601617795752e-05, "loss": 0.9594, "step": 10400 }, { "epoch": 0.4332398085492656, "grad_norm": 1.2232369184494019, "learning_rate": 2.8934951129086618e-05, "loss": 0.9644, "step": 10500 }, { "epoch": 0.4373659019640205, "grad_norm": 1.3971831798553467, "learning_rate": 2.872430064037749e-05, "loss": 0.987, "step": 10600 }, { "epoch": 0.44149199537877537, "grad_norm": 1.1187039613723755, "learning_rate": 2.8513650151668352e-05, "loss": 0.9657, "step": 10700 }, { "epoch": 0.44561808879353026, "grad_norm": 1.1717453002929688, "learning_rate": 2.8302999662959217e-05, "loss": 0.9363, "step": 10800 }, { "epoch": 0.4497441822082852, "grad_norm": 1.4479399919509888, "learning_rate": 2.809234917425009e-05, "loss": 0.9428, "step": 10900 }, { "epoch": 0.4538702756230401, "grad_norm": 1.1537368297576904, "learning_rate": 2.788169868554095e-05, "loss": 0.9654, "step": 11000 }, { "epoch": 0.457996369037795, "grad_norm": 1.9704123735427856, "learning_rate": 2.7671048196831817e-05, "loss": 0.944, "step": 11100 }, { "epoch": 0.4621224624525499, "grad_norm": 1.3609466552734375, "learning_rate": 2.7460397708122682e-05, "loss": 0.9353, "step": 11200 }, { "epoch": 0.46624855586730485, "grad_norm": 1.3835324048995972, "learning_rate": 2.724974721941355e-05, "loss": 0.9238, "step": 11300 }, { "epoch": 0.47037464928205974, "grad_norm": 2.1749815940856934, "learning_rate": 2.7039096730704416e-05, "loss": 0.9215, "step": 11400 }, { "epoch": 0.47450074269681464, "grad_norm": 1.941735863685608, "learning_rate": 2.682844624199528e-05, "loss": 0.9607, "step": 11500 }, { "epoch": 0.4786268361115696, "grad_norm": 1.9667292833328247, "learning_rate": 2.661779575328615e-05, "loss": 0.9493, "step": 11600 }, { "epoch": 0.4827529295263245, "grad_norm": 1.1912260055541992, "learning_rate": 2.6407145264577016e-05, "loss": 0.9364, "step": 11700 }, { "epoch": 0.4868790229410794, "grad_norm": 1.2728015184402466, "learning_rate": 2.619649477586788e-05, "loss": 0.9135, "step": 11800 }, { "epoch": 0.49100511635583427, "grad_norm": 1.326409935951233, "learning_rate": 2.5985844287158746e-05, "loss": 0.9665, "step": 11900 }, { "epoch": 0.4951312097705892, "grad_norm": 1.4567406177520752, "learning_rate": 2.5775193798449615e-05, "loss": 0.9733, "step": 12000 }, { "epoch": 0.4992573031853441, "grad_norm": 1.3147661685943604, "learning_rate": 2.556454330974048e-05, "loss": 0.9204, "step": 12100 }, { "epoch": 0.503383396600099, "grad_norm": 1.6704838275909424, "learning_rate": 2.5353892821031346e-05, "loss": 0.9418, "step": 12200 }, { "epoch": 0.5075094900148539, "grad_norm": 1.2493371963500977, "learning_rate": 2.5143242332322215e-05, "loss": 0.9441, "step": 12300 }, { "epoch": 0.5116355834296088, "grad_norm": 1.2380743026733398, "learning_rate": 2.493259184361308e-05, "loss": 0.9642, "step": 12400 }, { "epoch": 0.5157616768443638, "grad_norm": 1.487196922302246, "learning_rate": 2.4721941354903942e-05, "loss": 0.986, "step": 12500 }, { "epoch": 0.5198877702591187, "grad_norm": 1.2720383405685425, "learning_rate": 2.451129086619481e-05, "loss": 0.9614, "step": 12600 }, { "epoch": 0.5240138636738736, "grad_norm": 1.3985182046890259, "learning_rate": 2.4300640377485676e-05, "loss": 0.9327, "step": 12700 }, { "epoch": 0.5281399570886285, "grad_norm": 1.2555489540100098, "learning_rate": 2.408998988877654e-05, "loss": 0.9331, "step": 12800 }, { "epoch": 0.5322660505033834, "grad_norm": 1.083095908164978, "learning_rate": 2.387933940006741e-05, "loss": 0.9706, "step": 12900 }, { "epoch": 0.5363921439181383, "grad_norm": 3.2246696949005127, "learning_rate": 2.3668688911358276e-05, "loss": 0.9267, "step": 13000 }, { "epoch": 0.5405182373328932, "grad_norm": 1.2211159467697144, "learning_rate": 2.345803842264914e-05, "loss": 0.9315, "step": 13100 }, { "epoch": 0.5446443307476482, "grad_norm": 1.3726495504379272, "learning_rate": 2.3247387933940006e-05, "loss": 0.9432, "step": 13200 }, { "epoch": 0.5487704241624031, "grad_norm": 1.0996991395950317, "learning_rate": 2.3036737445230875e-05, "loss": 0.9213, "step": 13300 }, { "epoch": 0.552896517577158, "grad_norm": 1.016136884689331, "learning_rate": 2.282608695652174e-05, "loss": 0.9625, "step": 13400 }, { "epoch": 0.5570226109919129, "grad_norm": 1.1178189516067505, "learning_rate": 2.2615436467812606e-05, "loss": 0.9419, "step": 13500 }, { "epoch": 0.5611487044066678, "grad_norm": 1.1706444025039673, "learning_rate": 2.2404785979103475e-05, "loss": 0.885, "step": 13600 }, { "epoch": 0.5652747978214226, "grad_norm": 1.4330129623413086, "learning_rate": 2.2194135490394337e-05, "loss": 0.9024, "step": 13700 }, { "epoch": 0.5694008912361775, "grad_norm": 2.2776172161102295, "learning_rate": 2.1983485001685205e-05, "loss": 0.933, "step": 13800 }, { "epoch": 0.5735269846509325, "grad_norm": 1.3359657526016235, "learning_rate": 2.177283451297607e-05, "loss": 0.8791, "step": 13900 }, { "epoch": 0.5776530780656874, "grad_norm": 1.1592367887496948, "learning_rate": 2.1562184024266936e-05, "loss": 0.9336, "step": 14000 }, { "epoch": 0.5817791714804423, "grad_norm": 1.052618145942688, "learning_rate": 2.13515335355578e-05, "loss": 0.9303, "step": 14100 }, { "epoch": 0.5859052648951972, "grad_norm": 1.2330833673477173, "learning_rate": 2.114088304684867e-05, "loss": 0.9247, "step": 14200 }, { "epoch": 0.5900313583099521, "grad_norm": 1.7336995601654053, "learning_rate": 2.0930232558139536e-05, "loss": 0.9078, "step": 14300 }, { "epoch": 0.594157451724707, "grad_norm": 1.1562308073043823, "learning_rate": 2.07195820694304e-05, "loss": 0.905, "step": 14400 }, { "epoch": 0.5982835451394619, "grad_norm": 1.3212171792984009, "learning_rate": 2.050893158072127e-05, "loss": 0.9457, "step": 14500 }, { "epoch": 0.6024096385542169, "grad_norm": 1.5021255016326904, "learning_rate": 2.0298281092012135e-05, "loss": 0.9213, "step": 14600 }, { "epoch": 0.6065357319689718, "grad_norm": 1.1142035722732544, "learning_rate": 2.0087630603303e-05, "loss": 0.8988, "step": 14700 }, { "epoch": 0.6106618253837267, "grad_norm": 1.0887188911437988, "learning_rate": 1.9876980114593866e-05, "loss": 0.9579, "step": 14800 }, { "epoch": 0.6147879187984816, "grad_norm": 1.5622923374176025, "learning_rate": 1.966632962588473e-05, "loss": 0.9206, "step": 14900 }, { "epoch": 0.6189140122132365, "grad_norm": 1.4978774785995483, "learning_rate": 1.94556791371756e-05, "loss": 0.9292, "step": 15000 }, { "epoch": 0.6230401056279914, "grad_norm": 1.1494709253311157, "learning_rate": 1.9245028648466465e-05, "loss": 0.929, "step": 15100 }, { "epoch": 0.6271661990427463, "grad_norm": 3.5858824253082275, "learning_rate": 1.903437815975733e-05, "loss": 0.9181, "step": 15200 }, { "epoch": 0.6312922924575013, "grad_norm": 0.927173376083374, "learning_rate": 1.8823727671048196e-05, "loss": 0.9365, "step": 15300 }, { "epoch": 0.6354183858722562, "grad_norm": 0.9943380355834961, "learning_rate": 1.8613077182339065e-05, "loss": 0.8974, "step": 15400 }, { "epoch": 0.6395444792870111, "grad_norm": 1.4820857048034668, "learning_rate": 1.840242669362993e-05, "loss": 0.9066, "step": 15500 }, { "epoch": 0.643670572701766, "grad_norm": 1.3542896509170532, "learning_rate": 1.8191776204920796e-05, "loss": 0.9048, "step": 15600 }, { "epoch": 0.6477966661165209, "grad_norm": 2.233414888381958, "learning_rate": 1.7981125716211664e-05, "loss": 0.899, "step": 15700 }, { "epoch": 0.6519227595312758, "grad_norm": 1.0770349502563477, "learning_rate": 1.777047522750253e-05, "loss": 0.9135, "step": 15800 }, { "epoch": 0.6560488529460307, "grad_norm": 1.1688830852508545, "learning_rate": 1.7559824738793395e-05, "loss": 0.8838, "step": 15900 }, { "epoch": 0.6601749463607856, "grad_norm": 1.096822738647461, "learning_rate": 1.734917425008426e-05, "loss": 0.9325, "step": 16000 }, { "epoch": 0.6643010397755406, "grad_norm": 1.4621776342391968, "learning_rate": 1.713852376137513e-05, "loss": 0.9299, "step": 16100 }, { "epoch": 0.6684271331902955, "grad_norm": 1.2400994300842285, "learning_rate": 1.692787327266599e-05, "loss": 0.8986, "step": 16200 }, { "epoch": 0.6725532266050503, "grad_norm": 1.3540397882461548, "learning_rate": 1.671722278395686e-05, "loss": 0.9084, "step": 16300 }, { "epoch": 0.6766793200198052, "grad_norm": 1.2045152187347412, "learning_rate": 1.6506572295247725e-05, "loss": 0.8943, "step": 16400 }, { "epoch": 0.6808054134345601, "grad_norm": 1.1521943807601929, "learning_rate": 1.629592180653859e-05, "loss": 0.9089, "step": 16500 }, { "epoch": 0.684931506849315, "grad_norm": 4.699136257171631, "learning_rate": 1.608527131782946e-05, "loss": 0.9169, "step": 16600 }, { "epoch": 0.6890576002640699, "grad_norm": 1.3759478330612183, "learning_rate": 1.5874620829120325e-05, "loss": 0.9154, "step": 16700 }, { "epoch": 0.6931836936788249, "grad_norm": 1.2098520994186401, "learning_rate": 1.566397034041119e-05, "loss": 0.9264, "step": 16800 }, { "epoch": 0.6973097870935798, "grad_norm": 1.6775233745574951, "learning_rate": 1.5453319851702056e-05, "loss": 0.9309, "step": 16900 }, { "epoch": 0.7014358805083347, "grad_norm": 1.0574172735214233, "learning_rate": 1.5242669362992923e-05, "loss": 0.8893, "step": 17000 }, { "epoch": 0.7055619739230896, "grad_norm": 1.035610318183899, "learning_rate": 1.503201887428379e-05, "loss": 0.9243, "step": 17100 }, { "epoch": 0.7096880673378445, "grad_norm": 1.6291944980621338, "learning_rate": 1.4821368385574655e-05, "loss": 0.9158, "step": 17200 }, { "epoch": 0.7138141607525994, "grad_norm": 1.2090740203857422, "learning_rate": 1.4610717896865522e-05, "loss": 0.9026, "step": 17300 }, { "epoch": 0.7179402541673543, "grad_norm": 1.2179425954818726, "learning_rate": 1.4400067408156388e-05, "loss": 0.8943, "step": 17400 }, { "epoch": 0.7220663475821093, "grad_norm": 1.2382631301879883, "learning_rate": 1.4189416919447255e-05, "loss": 0.9021, "step": 17500 }, { "epoch": 0.7261924409968642, "grad_norm": 2.4923956394195557, "learning_rate": 1.3978766430738118e-05, "loss": 0.9348, "step": 17600 }, { "epoch": 0.7303185344116191, "grad_norm": 2.419496774673462, "learning_rate": 1.3768115942028985e-05, "loss": 0.9039, "step": 17700 }, { "epoch": 0.734444627826374, "grad_norm": 1.2352160215377808, "learning_rate": 1.3557465453319854e-05, "loss": 0.9266, "step": 17800 }, { "epoch": 0.7385707212411289, "grad_norm": 1.0967360734939575, "learning_rate": 1.3346814964610718e-05, "loss": 0.9046, "step": 17900 }, { "epoch": 0.7426968146558838, "grad_norm": 1.0056049823760986, "learning_rate": 1.3136164475901585e-05, "loss": 0.9321, "step": 18000 }, { "epoch": 0.7468229080706387, "grad_norm": 1.9823698997497559, "learning_rate": 1.292551398719245e-05, "loss": 0.9151, "step": 18100 }, { "epoch": 0.7509490014853937, "grad_norm": 1.651145577430725, "learning_rate": 1.2714863498483317e-05, "loss": 0.904, "step": 18200 }, { "epoch": 0.7550750949001486, "grad_norm": 0.9505665302276611, "learning_rate": 1.2504213009774183e-05, "loss": 0.878, "step": 18300 }, { "epoch": 0.7592011883149035, "grad_norm": 1.558278203010559, "learning_rate": 1.229356252106505e-05, "loss": 0.942, "step": 18400 }, { "epoch": 0.7633272817296584, "grad_norm": 1.2101174592971802, "learning_rate": 1.2082912032355915e-05, "loss": 0.9034, "step": 18500 }, { "epoch": 0.7674533751444133, "grad_norm": 1.2382097244262695, "learning_rate": 1.1872261543646782e-05, "loss": 0.9119, "step": 18600 }, { "epoch": 0.7715794685591681, "grad_norm": 1.1424338817596436, "learning_rate": 1.1661611054937648e-05, "loss": 0.9282, "step": 18700 }, { "epoch": 0.775705561973923, "grad_norm": 1.0747746229171753, "learning_rate": 1.1450960566228513e-05, "loss": 0.9144, "step": 18800 }, { "epoch": 0.779831655388678, "grad_norm": 1.4378238916397095, "learning_rate": 1.1240310077519382e-05, "loss": 0.9292, "step": 18900 }, { "epoch": 0.7839577488034329, "grad_norm": 1.5118451118469238, "learning_rate": 1.1029659588810247e-05, "loss": 0.8532, "step": 19000 }, { "epoch": 0.7880838422181878, "grad_norm": 1.135190725326538, "learning_rate": 1.0819009100101113e-05, "loss": 0.9132, "step": 19100 }, { "epoch": 0.7922099356329427, "grad_norm": 1.3497545719146729, "learning_rate": 1.060835861139198e-05, "loss": 0.9214, "step": 19200 }, { "epoch": 0.7963360290476976, "grad_norm": 1.3251924514770508, "learning_rate": 1.0397708122682845e-05, "loss": 0.8942, "step": 19300 }, { "epoch": 0.8004621224624525, "grad_norm": 2.453803539276123, "learning_rate": 1.018705763397371e-05, "loss": 0.8858, "step": 19400 }, { "epoch": 0.8045882158772074, "grad_norm": 1.1651134490966797, "learning_rate": 9.976407145264577e-06, "loss": 0.9012, "step": 19500 }, { "epoch": 0.8087143092919624, "grad_norm": 2.257159471511841, "learning_rate": 9.765756656555444e-06, "loss": 0.9167, "step": 19600 }, { "epoch": 0.8128404027067173, "grad_norm": 0.9240596294403076, "learning_rate": 9.55510616784631e-06, "loss": 0.8764, "step": 19700 }, { "epoch": 0.8169664961214722, "grad_norm": 1.2550618648529053, "learning_rate": 9.344455679137177e-06, "loss": 0.8998, "step": 19800 }, { "epoch": 0.8210925895362271, "grad_norm": 1.2276984453201294, "learning_rate": 9.133805190428042e-06, "loss": 0.909, "step": 19900 }, { "epoch": 0.825218682950982, "grad_norm": 1.0953816175460815, "learning_rate": 8.923154701718908e-06, "loss": 0.8931, "step": 20000 }, { "epoch": 0.8293447763657369, "grad_norm": 1.469269037246704, "learning_rate": 8.712504213009775e-06, "loss": 0.8789, "step": 20100 }, { "epoch": 0.8334708697804918, "grad_norm": 1.242390751838684, "learning_rate": 8.50185372430064e-06, "loss": 0.9126, "step": 20200 }, { "epoch": 0.8375969631952468, "grad_norm": 1.0811703205108643, "learning_rate": 8.291203235591507e-06, "loss": 0.913, "step": 20300 }, { "epoch": 0.8417230566100017, "grad_norm": 1.0523350238800049, "learning_rate": 8.080552746882374e-06, "loss": 0.9118, "step": 20400 }, { "epoch": 0.8458491500247566, "grad_norm": 1.4592727422714233, "learning_rate": 7.86990225817324e-06, "loss": 0.9099, "step": 20500 }, { "epoch": 0.8499752434395115, "grad_norm": 1.0648339986801147, "learning_rate": 7.659251769464105e-06, "loss": 0.9198, "step": 20600 }, { "epoch": 0.8541013368542664, "grad_norm": 1.3053339719772339, "learning_rate": 7.448601280754971e-06, "loss": 0.9156, "step": 20700 }, { "epoch": 0.8582274302690213, "grad_norm": 1.0929012298583984, "learning_rate": 7.237950792045837e-06, "loss": 0.9103, "step": 20800 }, { "epoch": 0.8623535236837762, "grad_norm": 1.234263300895691, "learning_rate": 7.027300303336704e-06, "loss": 0.9016, "step": 20900 }, { "epoch": 0.8664796170985312, "grad_norm": 1.3241745233535767, "learning_rate": 6.816649814627571e-06, "loss": 0.9075, "step": 21000 }, { "epoch": 0.8706057105132861, "grad_norm": 1.2847357988357544, "learning_rate": 6.605999325918437e-06, "loss": 0.9462, "step": 21100 }, { "epoch": 0.874731803928041, "grad_norm": 1.1206868886947632, "learning_rate": 6.395348837209303e-06, "loss": 0.9236, "step": 21200 }, { "epoch": 0.8788578973427958, "grad_norm": 1.1748895645141602, "learning_rate": 6.1846983485001685e-06, "loss": 0.8521, "step": 21300 }, { "epoch": 0.8829839907575507, "grad_norm": 1.571519136428833, "learning_rate": 5.974047859791035e-06, "loss": 0.8811, "step": 21400 }, { "epoch": 0.8871100841723056, "grad_norm": 1.051316738128662, "learning_rate": 5.763397371081901e-06, "loss": 0.9167, "step": 21500 }, { "epoch": 0.8912361775870605, "grad_norm": 2.111393690109253, "learning_rate": 5.552746882372767e-06, "loss": 0.9333, "step": 21600 }, { "epoch": 0.8953622710018155, "grad_norm": 1.3969411849975586, "learning_rate": 5.342096393663633e-06, "loss": 0.9434, "step": 21700 }, { "epoch": 0.8994883644165704, "grad_norm": 1.7783890962600708, "learning_rate": 5.1314459049545e-06, "loss": 0.8947, "step": 21800 }, { "epoch": 0.9036144578313253, "grad_norm": 1.359174132347107, "learning_rate": 4.920795416245366e-06, "loss": 0.8815, "step": 21900 }, { "epoch": 0.9077405512460802, "grad_norm": 1.257117748260498, "learning_rate": 4.710144927536232e-06, "loss": 0.8986, "step": 22000 }, { "epoch": 0.9118666446608351, "grad_norm": 0.9748762845993042, "learning_rate": 4.499494438827098e-06, "loss": 0.887, "step": 22100 }, { "epoch": 0.91599273807559, "grad_norm": 1.5360727310180664, "learning_rate": 4.2888439501179645e-06, "loss": 0.9051, "step": 22200 }, { "epoch": 0.9201188314903449, "grad_norm": 1.0747774839401245, "learning_rate": 4.078193461408831e-06, "loss": 0.9498, "step": 22300 }, { "epoch": 0.9242449249050998, "grad_norm": 1.197403073310852, "learning_rate": 3.867542972699697e-06, "loss": 0.925, "step": 22400 }, { "epoch": 0.9283710183198548, "grad_norm": 1.580825924873352, "learning_rate": 3.6568924839905627e-06, "loss": 0.9019, "step": 22500 }, { "epoch": 0.9324971117346097, "grad_norm": 1.2338446378707886, "learning_rate": 3.4462419952814294e-06, "loss": 0.892, "step": 22600 }, { "epoch": 0.9366232051493646, "grad_norm": 2.6846702098846436, "learning_rate": 3.2355915065722956e-06, "loss": 0.9121, "step": 22700 }, { "epoch": 0.9407492985641195, "grad_norm": 1.0765039920806885, "learning_rate": 3.0249410178631614e-06, "loss": 0.9101, "step": 22800 }, { "epoch": 0.9448753919788744, "grad_norm": 1.2273006439208984, "learning_rate": 2.814290529154028e-06, "loss": 0.8916, "step": 22900 }, { "epoch": 0.9490014853936293, "grad_norm": 1.295823574066162, "learning_rate": 2.603640040444894e-06, "loss": 0.8958, "step": 23000 }, { "epoch": 0.9531275788083842, "grad_norm": 1.5502872467041016, "learning_rate": 2.39298955173576e-06, "loss": 0.9069, "step": 23100 }, { "epoch": 0.9572536722231392, "grad_norm": 2.524392604827881, "learning_rate": 2.1823390630266263e-06, "loss": 0.9301, "step": 23200 }, { "epoch": 0.9613797656378941, "grad_norm": 1.5065919160842896, "learning_rate": 1.9716885743174925e-06, "loss": 0.8903, "step": 23300 }, { "epoch": 0.965505859052649, "grad_norm": 1.1356451511383057, "learning_rate": 1.761038085608359e-06, "loss": 0.881, "step": 23400 }, { "epoch": 0.9696319524674039, "grad_norm": 6.048961162567139, "learning_rate": 1.550387596899225e-06, "loss": 0.9164, "step": 23500 }, { "epoch": 0.9737580458821588, "grad_norm": 1.1151750087738037, "learning_rate": 1.339737108190091e-06, "loss": 0.8703, "step": 23600 }, { "epoch": 0.9778841392969136, "grad_norm": 1.0021706819534302, "learning_rate": 1.1290866194809571e-06, "loss": 0.8978, "step": 23700 }, { "epoch": 0.9820102327116685, "grad_norm": 1.1401609182357788, "learning_rate": 9.184361307718234e-07, "loss": 0.9159, "step": 23800 }, { "epoch": 0.9861363261264235, "grad_norm": 1.0629512071609497, "learning_rate": 7.077856420626896e-07, "loss": 0.9073, "step": 23900 }, { "epoch": 0.9902624195411784, "grad_norm": 1.0977869033813477, "learning_rate": 4.971351533535558e-07, "loss": 0.9178, "step": 24000 }, { "epoch": 0.9943885129559333, "grad_norm": 1.281014323234558, "learning_rate": 2.8648466464442196e-07, "loss": 0.8876, "step": 24100 }, { "epoch": 0.9985146063706882, "grad_norm": 1.2343029975891113, "learning_rate": 7.583417593528817e-08, "loss": 0.8733, "step": 24200 } ], "logging_steps": 100, "max_steps": 24236, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.002725362748621e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }