{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.995877233165369, "eval_steps": 500, "global_step": 1635, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01832340815391663, "grad_norm": 0.06011037901043892, "learning_rate": 4e-05, "loss": 1.296, "step": 10 }, { "epoch": 0.03664681630783326, "grad_norm": 0.05856110155582428, "learning_rate": 8e-05, "loss": 1.3316, "step": 20 }, { "epoch": 0.054970224461749886, "grad_norm": 0.0607464499771595, "learning_rate": 0.00012, "loss": 1.2794, "step": 30 }, { "epoch": 0.07329363261566652, "grad_norm": 0.06632011383771896, "learning_rate": 0.00016, "loss": 1.3129, "step": 40 }, { "epoch": 0.09161704076958314, "grad_norm": 0.06631691753864288, "learning_rate": 0.0002, "loss": 1.2741, "step": 50 }, { "epoch": 0.10994044892349977, "grad_norm": 0.056466877460479736, "learning_rate": 0.00019998035748930052, "loss": 1.2717, "step": 60 }, { "epoch": 0.1282638570774164, "grad_norm": 0.05860245227813721, "learning_rate": 0.00019992143767376668, "loss": 1.2091, "step": 70 }, { "epoch": 0.14658726523133303, "grad_norm": 0.06553175300359726, "learning_rate": 0.00019982326370006058, "loss": 1.1926, "step": 80 }, { "epoch": 0.16491067338524965, "grad_norm": 0.07061401754617691, "learning_rate": 0.00019968587413584876, "loss": 1.1767, "step": 90 }, { "epoch": 0.1832340815391663, "grad_norm": 0.07183243334293365, "learning_rate": 0.000199509322954651, "loss": 1.1183, "step": 100 }, { "epoch": 0.2015574896930829, "grad_norm": 0.06944898515939713, "learning_rate": 0.00019929367951463655, "loss": 1.0868, "step": 110 }, { "epoch": 0.21988089784699955, "grad_norm": 0.06642703711986542, "learning_rate": 0.00019903902853137703, "loss": 1.048, "step": 120 }, { "epoch": 0.23820430600091616, "grad_norm": 0.06603793054819107, "learning_rate": 0.00019874547004456562, "loss": 1.0195, "step": 130 }, { "epoch": 0.2565277141548328, "grad_norm": 0.06488285213708878, "learning_rate": 0.00019841311937871675, "loss": 1.0014, "step": 140 }, { "epoch": 0.2748511223087494, "grad_norm": 0.05940372124314308, "learning_rate": 0.0001980421070978606, "loss": 0.9943, "step": 150 }, { "epoch": 0.29317453046266606, "grad_norm": 0.059967171400785446, "learning_rate": 0.00019763257895425113, "loss": 0.9349, "step": 160 }, { "epoch": 0.3114979386165827, "grad_norm": 0.0554397851228714, "learning_rate": 0.0001971846958311071, "loss": 0.9045, "step": 170 }, { "epoch": 0.3298213467704993, "grad_norm": 0.055131904780864716, "learning_rate": 0.00019669863367940935, "loss": 0.8799, "step": 180 }, { "epoch": 0.34814475492441593, "grad_norm": 0.04358826205134392, "learning_rate": 0.00019617458344877816, "loss": 0.8504, "step": 190 }, { "epoch": 0.3664681630783326, "grad_norm": 0.04535752162337303, "learning_rate": 0.00019561275101245883, "loss": 0.828, "step": 200 }, { "epoch": 0.3847915712322492, "grad_norm": 0.04672062397003174, "learning_rate": 0.00019501335708644414, "loss": 0.8114, "step": 210 }, { "epoch": 0.4031149793861658, "grad_norm": 0.04161343351006508, "learning_rate": 0.00019437663714276618, "loss": 0.846, "step": 220 }, { "epoch": 0.42143838754008245, "grad_norm": 0.03887801244854927, "learning_rate": 0.0001937028413169911, "loss": 0.7911, "step": 230 }, { "epoch": 0.4397617956939991, "grad_norm": 0.03659196197986603, "learning_rate": 0.00019299223430995323, "loss": 0.7669, "step": 240 }, { "epoch": 0.45808520384791573, "grad_norm": 0.03447382524609566, "learning_rate": 0.00019224509528376738, "loss": 0.782, "step": 250 }, { "epoch": 0.4764086120018323, "grad_norm": 0.028725607320666313, "learning_rate": 0.00019146171775215982, "loss": 0.7183, "step": 260 }, { "epoch": 0.49473202015574896, "grad_norm": 0.027673941105604172, "learning_rate": 0.0001906424094651615, "loss": 0.7018, "step": 270 }, { "epoch": 0.5130554283096656, "grad_norm": 0.10227353870868683, "learning_rate": 0.00018978749228820826, "loss": 0.72, "step": 280 }, { "epoch": 0.5313788364635822, "grad_norm": 0.022650673985481262, "learning_rate": 0.00018889730207569607, "loss": 0.6936, "step": 290 }, { "epoch": 0.5497022446174988, "grad_norm": 0.023469725623726845, "learning_rate": 0.00018797218853904037, "loss": 0.6765, "step": 300 }, { "epoch": 0.5680256527714155, "grad_norm": 0.018101360648870468, "learning_rate": 0.000187012515109292, "loss": 0.6799, "step": 310 }, { "epoch": 0.5863490609253321, "grad_norm": 0.016794538125395775, "learning_rate": 0.00018601865879436317, "loss": 0.6732, "step": 320 }, { "epoch": 0.6046724690792488, "grad_norm": 0.017263714224100113, "learning_rate": 0.00018499101003091993, "loss": 0.6695, "step": 330 }, { "epoch": 0.6229958772331654, "grad_norm": 0.016381224617362022, "learning_rate": 0.0001839299725309989, "loss": 0.6928, "step": 340 }, { "epoch": 0.641319285387082, "grad_norm": 0.015325487591326237, "learning_rate": 0.00018283596312340891, "loss": 0.6622, "step": 350 }, { "epoch": 0.6596426935409986, "grad_norm": 0.014056784100830555, "learning_rate": 0.0001817094115899799, "loss": 0.7612, "step": 360 }, { "epoch": 0.6779661016949152, "grad_norm": 0.015031951479613781, "learning_rate": 0.00018055076049672283, "loss": 0.6596, "step": 370 }, { "epoch": 0.6962895098488319, "grad_norm": 0.01640532910823822, "learning_rate": 0.00017936046501996762, "loss": 0.6837, "step": 380 }, { "epoch": 0.7146129180027485, "grad_norm": 0.01830482669174671, "learning_rate": 0.000178138992767547, "loss": 0.6812, "step": 390 }, { "epoch": 0.7329363261566652, "grad_norm": 0.0472831092774868, "learning_rate": 0.00017688682359509678, "loss": 0.674, "step": 400 }, { "epoch": 0.7512597343105818, "grad_norm": 0.012456170283257961, "learning_rate": 0.00017560444941754427, "loss": 0.6518, "step": 410 }, { "epoch": 0.7695831424644984, "grad_norm": 0.01401186641305685, "learning_rate": 0.0001742923740158595, "loss": 0.6418, "step": 420 }, { "epoch": 0.7879065506184151, "grad_norm": 0.015530922450125217, "learning_rate": 0.00017295111283914487, "loss": 0.6465, "step": 430 }, { "epoch": 0.8062299587723316, "grad_norm": 0.01402275450527668, "learning_rate": 0.0001715811928021406, "loss": 0.6642, "step": 440 }, { "epoch": 0.8245533669262483, "grad_norm": 0.01176263578236103, "learning_rate": 0.0001701831520782264, "loss": 0.6336, "step": 450 }, { "epoch": 0.8428767750801649, "grad_norm": 0.013003438711166382, "learning_rate": 0.00016875753988799982, "loss": 0.6469, "step": 460 }, { "epoch": 0.8612001832340815, "grad_norm": 0.011523702181875706, "learning_rate": 0.00016730491628351487, "loss": 0.6434, "step": 470 }, { "epoch": 0.8795235913879982, "grad_norm": 0.011919384822249413, "learning_rate": 0.00016582585192826543, "loss": 0.6588, "step": 480 }, { "epoch": 0.8978469995419148, "grad_norm": 0.013994649983942509, "learning_rate": 0.00016432092787299992, "loss": 0.6315, "step": 490 }, { "epoch": 0.9161704076958315, "grad_norm": 0.013580686412751675, "learning_rate": 0.00016279073532745553, "loss": 0.6782, "step": 500 }, { "epoch": 0.934493815849748, "grad_norm": 0.01364163402467966, "learning_rate": 0.00016123587542810118, "loss": 0.6334, "step": 510 }, { "epoch": 0.9528172240036646, "grad_norm": 0.013080372475087643, "learning_rate": 0.0001596569590019811, "loss": 0.6233, "step": 520 }, { "epoch": 0.9711406321575813, "grad_norm": 0.056398555636405945, "learning_rate": 0.00015805460632675112, "loss": 0.6557, "step": 530 }, { "epoch": 0.9894640403114979, "grad_norm": 0.012467793188989162, "learning_rate": 0.00015642944688700264, "loss": 0.6315, "step": 540 }, { "epoch": 1.0077874484654146, "grad_norm": 0.012495579198002815, "learning_rate": 0.00015478211912696929, "loss": 0.6177, "step": 550 }, { "epoch": 1.026110856619331, "grad_norm": 0.010633349418640137, "learning_rate": 0.00015311327019971413, "loss": 0.644, "step": 560 }, { "epoch": 1.0444342647732479, "grad_norm": 0.012223353609442711, "learning_rate": 0.00015142355571289533, "loss": 0.6502, "step": 570 }, { "epoch": 1.0627576729271644, "grad_norm": 0.012305443175137043, "learning_rate": 0.00014971363947121065, "loss": 0.6185, "step": 580 }, { "epoch": 1.0810810810810811, "grad_norm": 0.017129750922322273, "learning_rate": 0.0001479841932156215, "loss": 0.6154, "step": 590 }, { "epoch": 1.0994044892349977, "grad_norm": 0.013129614293575287, "learning_rate": 0.0001462358963594595, "loss": 0.614, "step": 600 }, { "epoch": 1.1177278973889144, "grad_norm": 0.01199612207710743, "learning_rate": 0.00014446943572151867, "loss": 0.6128, "step": 610 }, { "epoch": 1.136051305542831, "grad_norm": 0.012518757954239845, "learning_rate": 0.00014268550525623868, "loss": 0.6169, "step": 620 }, { "epoch": 1.1543747136967477, "grad_norm": 0.01321893185377121, "learning_rate": 0.00014088480578108454, "loss": 0.6402, "step": 630 }, { "epoch": 1.1726981218506642, "grad_norm": 0.012497123330831528, "learning_rate": 0.00013906804470123038, "loss": 0.613, "step": 640 }, { "epoch": 1.1910215300045808, "grad_norm": 0.01103185210376978, "learning_rate": 0.00013723593573165523, "loss": 0.6114, "step": 650 }, { "epoch": 1.2093449381584975, "grad_norm": 0.012833209708333015, "learning_rate": 0.00013538919861675979, "loss": 0.617, "step": 660 }, { "epoch": 1.227668346312414, "grad_norm": 0.058991171419620514, "learning_rate": 0.0001335285588476148, "loss": 0.6298, "step": 670 }, { "epoch": 1.2459917544663308, "grad_norm": 0.013424506410956383, "learning_rate": 0.00013165474737695184, "loss": 0.6488, "step": 680 }, { "epoch": 1.2643151626202473, "grad_norm": 0.01241598092019558, "learning_rate": 0.00012976850033200805, "loss": 0.6088, "step": 690 }, { "epoch": 1.2826385707741639, "grad_norm": 0.012560844421386719, "learning_rate": 0.00012787055872533865, "loss": 0.6032, "step": 700 }, { "epoch": 1.3009619789280806, "grad_norm": 0.010990115813910961, "learning_rate": 0.00012596166816371005, "loss": 0.6282, "step": 710 }, { "epoch": 1.3192853870819974, "grad_norm": 0.01151216309517622, "learning_rate": 0.00012404257855518782, "loss": 0.6297, "step": 720 }, { "epoch": 1.337608795235914, "grad_norm": 0.011924243532121181, "learning_rate": 0.0001221140438145353, "loss": 0.6044, "step": 730 }, { "epoch": 1.3559322033898304, "grad_norm": 0.013133584521710873, "learning_rate": 0.00012017682156703807, "loss": 0.6107, "step": 740 }, { "epoch": 1.3742556115437472, "grad_norm": 0.014631664380431175, "learning_rate": 0.00011823167285087063, "loss": 0.6213, "step": 750 }, { "epoch": 1.3925790196976637, "grad_norm": 0.011716130189597607, "learning_rate": 0.00011627936181812234, "loss": 0.6179, "step": 760 }, { "epoch": 1.4109024278515805, "grad_norm": 0.013568080961704254, "learning_rate": 0.00011432065543460015, "loss": 0.5965, "step": 770 }, { "epoch": 1.429225836005497, "grad_norm": 0.012749516405165195, "learning_rate": 0.00011235632317852605, "loss": 0.6128, "step": 780 }, { "epoch": 1.4475492441594136, "grad_norm": 0.011930575594305992, "learning_rate": 0.00011038713673824715, "loss": 0.6117, "step": 790 }, { "epoch": 1.4658726523133303, "grad_norm": 0.013386845588684082, "learning_rate": 0.00010841386970907785, "loss": 0.6186, "step": 800 }, { "epoch": 1.4841960604672468, "grad_norm": 0.012542261742055416, "learning_rate": 0.00010643729728939292, "loss": 0.5909, "step": 810 }, { "epoch": 1.5025194686211636, "grad_norm": 0.010874781757593155, "learning_rate": 0.0001044581959760903, "loss": 0.5903, "step": 820 }, { "epoch": 1.5208428767750801, "grad_norm": 0.010801080614328384, "learning_rate": 0.00010247734325954447, "loss": 0.5929, "step": 830 }, { "epoch": 1.5391662849289967, "grad_norm": 0.012022151611745358, "learning_rate": 0.00010049551731816902, "loss": 0.6117, "step": 840 }, { "epoch": 1.5574896930829134, "grad_norm": 0.011683526448905468, "learning_rate": 9.851349671270909e-05, "loss": 0.6283, "step": 850 }, { "epoch": 1.5758131012368302, "grad_norm": 0.01242094673216343, "learning_rate": 9.653206008038364e-05, "loss": 0.5901, "step": 860 }, { "epoch": 1.5941365093907467, "grad_norm": 0.011935061775147915, "learning_rate": 9.455198582899774e-05, "loss": 0.5848, "step": 870 }, { "epoch": 1.6124599175446632, "grad_norm": 0.01208607666194439, "learning_rate": 9.257405183114473e-05, "loss": 0.5912, "step": 880 }, { "epoch": 1.63078332569858, "grad_norm": 0.01227467879652977, "learning_rate": 9.059903511861891e-05, "loss": 0.5859, "step": 890 }, { "epoch": 1.6491067338524965, "grad_norm": 0.0517101027071476, "learning_rate": 8.862771157715847e-05, "loss": 0.607, "step": 900 }, { "epoch": 1.6674301420064133, "grad_norm": 0.010776874609291553, "learning_rate": 8.666085564163852e-05, "loss": 0.5788, "step": 910 }, { "epoch": 1.6857535501603298, "grad_norm": 0.013487796299159527, "learning_rate": 8.469923999183411e-05, "loss": 0.5766, "step": 920 }, { "epoch": 1.7040769583142463, "grad_norm": 0.011671481654047966, "learning_rate": 8.274363524887315e-05, "loss": 0.5976, "step": 930 }, { "epoch": 1.722400366468163, "grad_norm": 0.01118433102965355, "learning_rate": 8.079480967249737e-05, "loss": 0.6021, "step": 940 }, { "epoch": 1.7407237746220798, "grad_norm": 0.013788875192403793, "learning_rate": 7.88535288592514e-05, "loss": 0.579, "step": 950 }, { "epoch": 1.7590471827759964, "grad_norm": 0.013310333713889122, "learning_rate": 7.692055544171823e-05, "loss": 0.5979, "step": 960 }, { "epoch": 1.777370590929913, "grad_norm": 0.053230684250593185, "learning_rate": 7.49966487889185e-05, "loss": 0.5906, "step": 970 }, { "epoch": 1.7956939990838294, "grad_norm": 0.013503102585673332, "learning_rate": 7.308256470799256e-05, "loss": 0.6061, "step": 980 }, { "epoch": 1.8140174072377462, "grad_norm": 0.01030020508915186, "learning_rate": 7.117905514728107e-05, "loss": 0.5776, "step": 990 }, { "epoch": 1.832340815391663, "grad_norm": 0.01204043161123991, "learning_rate": 6.928686790092235e-05, "loss": 0.584, "step": 1000 }, { "epoch": 1.8506642235455795, "grad_norm": 0.011951706372201443, "learning_rate": 6.740674631508105e-05, "loss": 0.5693, "step": 1010 }, { "epoch": 1.868987631699496, "grad_norm": 0.011565761640667915, "learning_rate": 6.553942899592447e-05, "loss": 0.594, "step": 1020 }, { "epoch": 1.8873110398534128, "grad_norm": 0.011589662171900272, "learning_rate": 6.368564951946103e-05, "loss": 0.5961, "step": 1030 }, { "epoch": 1.9056344480073295, "grad_norm": 0.011061927303671837, "learning_rate": 6.184613614335476e-05, "loss": 0.5906, "step": 1040 }, { "epoch": 1.923957856161246, "grad_norm": 0.011009737849235535, "learning_rate": 6.002161152082909e-05, "loss": 0.5703, "step": 1050 }, { "epoch": 1.9422812643151626, "grad_norm": 0.012721202336251736, "learning_rate": 5.8212792416772374e-05, "loss": 0.5728, "step": 1060 }, { "epoch": 1.960604672469079, "grad_norm": 0.014589471742510796, "learning_rate": 5.6420389426156814e-05, "loss": 0.5722, "step": 1070 }, { "epoch": 1.9789280806229959, "grad_norm": 0.011400967836380005, "learning_rate": 5.464510669488073e-05, "loss": 0.5699, "step": 1080 }, { "epoch": 1.9972514887769126, "grad_norm": 0.011737444438040257, "learning_rate": 5.288764164314499e-05, "loss": 0.5872, "step": 1090 }, { "epoch": 2.015574896930829, "grad_norm": 0.011182826943695545, "learning_rate": 5.1148684691471304e-05, "loss": 0.5722, "step": 1100 }, { "epoch": 2.0338983050847457, "grad_norm": 0.011314311996102333, "learning_rate": 4.942891898947024e-05, "loss": 0.5697, "step": 1110 }, { "epoch": 2.052221713238662, "grad_norm": 0.04409461468458176, "learning_rate": 4.772902014746583e-05, "loss": 0.6125, "step": 1120 }, { "epoch": 2.070545121392579, "grad_norm": 0.01187494769692421, "learning_rate": 4.6049655971081916e-05, "loss": 0.5878, "step": 1130 }, { "epoch": 2.0888685295464957, "grad_norm": 0.013561342842876911, "learning_rate": 4.439148619889453e-05, "loss": 0.5681, "step": 1140 }, { "epoch": 2.1071919377004122, "grad_norm": 0.01049109362065792, "learning_rate": 4.2755162243253554e-05, "loss": 0.5731, "step": 1150 }, { "epoch": 2.1255153458543288, "grad_norm": 0.012508846819400787, "learning_rate": 4.114132693437511e-05, "loss": 0.5683, "step": 1160 }, { "epoch": 2.1438387540082458, "grad_norm": 0.011165381409227848, "learning_rate": 3.955061426780562e-05, "loss": 0.6208, "step": 1170 }, { "epoch": 2.1621621621621623, "grad_norm": 0.010869491845369339, "learning_rate": 3.7983649155356536e-05, "loss": 0.5706, "step": 1180 }, { "epoch": 2.180485570316079, "grad_norm": 0.012921607121825218, "learning_rate": 3.644104717960761e-05, "loss": 0.5615, "step": 1190 }, { "epoch": 2.1988089784699953, "grad_norm": 0.010369219817221165, "learning_rate": 3.492341435207509e-05, "loss": 0.5602, "step": 1200 }, { "epoch": 2.217132386623912, "grad_norm": 0.05246191471815109, "learning_rate": 3.343134687514007e-05, "loss": 0.5924, "step": 1210 }, { "epoch": 2.235455794777829, "grad_norm": 0.04603949189186096, "learning_rate": 3.1965430907830166e-05, "loss": 0.5805, "step": 1220 }, { "epoch": 2.2537792029317454, "grad_norm": 0.01212139893323183, "learning_rate": 3.0526242335546716e-05, "loss": 0.5783, "step": 1230 }, { "epoch": 2.272102611085662, "grad_norm": 0.010512073524296284, "learning_rate": 2.911434654382842e-05, "loss": 0.5734, "step": 1240 }, { "epoch": 2.2904260192395784, "grad_norm": 0.011081124655902386, "learning_rate": 2.773029819623917e-05, "loss": 0.5688, "step": 1250 }, { "epoch": 2.3087494273934954, "grad_norm": 0.013736708089709282, "learning_rate": 2.6374641016468416e-05, "loss": 0.6005, "step": 1260 }, { "epoch": 2.327072835547412, "grad_norm": 0.010449289344251156, "learning_rate": 2.5047907574729456e-05, "loss": 0.5788, "step": 1270 }, { "epoch": 2.3453962437013285, "grad_norm": 0.017625920474529266, "learning_rate": 2.375061907853867e-05, "loss": 0.5613, "step": 1280 }, { "epoch": 2.363719651855245, "grad_norm": 0.012163090519607067, "learning_rate": 2.24832851679594e-05, "loss": 0.5709, "step": 1290 }, { "epoch": 2.3820430600091616, "grad_norm": 0.010039297863841057, "learning_rate": 2.1246403715389674e-05, "loss": 0.5677, "step": 1300 }, { "epoch": 2.4003664681630785, "grad_norm": 0.012111610732972622, "learning_rate": 2.0040460629972792e-05, "loss": 0.5898, "step": 1310 }, { "epoch": 2.418689876316995, "grad_norm": 0.012733533047139645, "learning_rate": 1.8865929666707904e-05, "loss": 0.5982, "step": 1320 }, { "epoch": 2.4370132844709116, "grad_norm": 0.011774188838899136, "learning_rate": 1.7723272240335265e-05, "loss": 0.5649, "step": 1330 }, { "epoch": 2.455336692624828, "grad_norm": 0.013299621641635895, "learning_rate": 1.6612937244069328e-05, "loss": 0.5775, "step": 1340 }, { "epoch": 2.4736601007787447, "grad_norm": 0.012371544726192951, "learning_rate": 1.5535360873251027e-05, "loss": 0.5657, "step": 1350 }, { "epoch": 2.4919835089326616, "grad_norm": 0.013111775740981102, "learning_rate": 1.4490966453988187e-05, "loss": 0.5814, "step": 1360 }, { "epoch": 2.510306917086578, "grad_norm": 0.011858658865094185, "learning_rate": 1.3480164276851926e-05, "loss": 0.562, "step": 1370 }, { "epoch": 2.5286303252404947, "grad_norm": 0.017958352342247963, "learning_rate": 1.2503351435693811e-05, "loss": 0.5882, "step": 1380 }, { "epoch": 2.5469537333944112, "grad_norm": 0.012084643356502056, "learning_rate": 1.1560911671647535e-05, "loss": 0.6038, "step": 1390 }, { "epoch": 2.5652771415483278, "grad_norm": 0.012955896556377411, "learning_rate": 1.0653215222376045e-05, "loss": 0.5603, "step": 1400 }, { "epoch": 2.5836005497022447, "grad_norm": 0.010830081067979336, "learning_rate": 9.78061867662372e-06, "loss": 0.5596, "step": 1410 }, { "epoch": 2.6019239578561613, "grad_norm": 0.009481418877840042, "learning_rate": 8.943464834130289e-06, "loss": 0.567, "step": 1420 }, { "epoch": 2.620247366010078, "grad_norm": 0.009960277937352657, "learning_rate": 8.14208257096185e-06, "loss": 0.5627, "step": 1430 }, { "epoch": 2.6385707741639948, "grad_norm": 0.011430955491960049, "learning_rate": 7.376786710312045e-06, "loss": 0.5642, "step": 1440 }, { "epoch": 2.6568941823179113, "grad_norm": 0.010986811481416225, "learning_rate": 6.647877898823462e-06, "loss": 0.5637, "step": 1450 }, { "epoch": 2.675217590471828, "grad_norm": 0.011870593763887882, "learning_rate": 5.955642488478675e-06, "loss": 0.5611, "step": 1460 }, { "epoch": 2.6935409986257444, "grad_norm": 0.013593550771474838, "learning_rate": 5.300352424106781e-06, "loss": 0.5611, "step": 1470 }, { "epoch": 2.711864406779661, "grad_norm": 0.01099549513310194, "learning_rate": 4.682265136549768e-06, "loss": 0.5616, "step": 1480 }, { "epoch": 2.7301878149335774, "grad_norm": 0.010328186675906181, "learning_rate": 4.1016234415308555e-06, "loss": 0.5648, "step": 1490 }, { "epoch": 2.7485112230874944, "grad_norm": 0.012961314059793949, "learning_rate": 3.5586554442641583e-06, "loss": 0.5791, "step": 1500 }, { "epoch": 2.766834631241411, "grad_norm": 0.010046335868537426, "learning_rate": 3.0535744498435993e-06, "loss": 0.5641, "step": 1510 }, { "epoch": 2.7851580393953275, "grad_norm": 0.010482456535100937, "learning_rate": 2.5865788794459223e-06, "loss": 0.5629, "step": 1520 }, { "epoch": 2.803481447549244, "grad_norm": 0.015615337528288364, "learning_rate": 2.1578521923808713e-06, "loss": 0.5698, "step": 1530 }, { "epoch": 2.821804855703161, "grad_norm": 0.012672988697886467, "learning_rate": 1.767562814019208e-06, "loss": 0.5662, "step": 1540 }, { "epoch": 2.8401282638570775, "grad_norm": 0.011427894234657288, "learning_rate": 1.41586406962676e-06, "loss": 0.5991, "step": 1550 }, { "epoch": 2.858451672010994, "grad_norm": 0.011262394487857819, "learning_rate": 1.1028941241305046e-06, "loss": 0.562, "step": 1560 }, { "epoch": 2.8767750801649106, "grad_norm": 0.011531077325344086, "learning_rate": 8.287759278405083e-07, "loss": 0.5806, "step": 1570 }, { "epoch": 2.895098488318827, "grad_norm": 0.01074717566370964, "learning_rate": 5.936171681488301e-07, "loss": 0.5649, "step": 1580 }, { "epoch": 2.913421896472744, "grad_norm": 0.011177337728440762, "learning_rate": 3.9751022722455123e-07, "loss": 0.5719, "step": 1590 }, { "epoch": 2.9317453046266606, "grad_norm": 0.013694767840206623, "learning_rate": 2.4053214572137275e-07, "loss": 0.58, "step": 1600 }, { "epoch": 2.950068712780577, "grad_norm": 0.01134565845131874, "learning_rate": 1.2274459251220282e-07, "loss": 0.5592, "step": 1610 }, { "epoch": 2.9683921209344937, "grad_norm": 0.009969279170036316, "learning_rate": 4.4193840462536384e-08, "loss": 0.5669, "step": 1620 }, { "epoch": 2.9867155290884106, "grad_norm": 0.010615854524075985, "learning_rate": 4.9107482521071335e-09, "loss": 0.5561, "step": 1630 } ], "logging_steps": 10, "max_steps": 1635, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2063088450732032e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }