{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997037037037036, "eval_steps": 5000, "global_step": 1687, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005925925925925926, "grad_norm": 0.5658125281333923, "learning_rate": 2.0000000000000003e-06, "loss": 1.9859, "step": 10 }, { "epoch": 0.011851851851851851, "grad_norm": 0.4840392470359802, "learning_rate": 4.000000000000001e-06, "loss": 1.9552, "step": 20 }, { "epoch": 0.017777777777777778, "grad_norm": 0.5106642246246338, "learning_rate": 6e-06, "loss": 1.8641, "step": 30 }, { "epoch": 0.023703703703703703, "grad_norm": 0.3882107436656952, "learning_rate": 8.000000000000001e-06, "loss": 1.8952, "step": 40 }, { "epoch": 0.02962962962962963, "grad_norm": 0.5257787704467773, "learning_rate": 1e-05, "loss": 2.0206, "step": 50 }, { "epoch": 0.035555555555555556, "grad_norm": 0.466878741979599, "learning_rate": 1.2e-05, "loss": 1.8342, "step": 60 }, { "epoch": 0.04148148148148148, "grad_norm": 0.5169644355773926, "learning_rate": 1.4000000000000001e-05, "loss": 1.9108, "step": 70 }, { "epoch": 0.047407407407407405, "grad_norm": 0.5135151743888855, "learning_rate": 1.6000000000000003e-05, "loss": 2.0776, "step": 80 }, { "epoch": 0.05333333333333334, "grad_norm": 0.6329066157341003, "learning_rate": 1.8e-05, "loss": 2.0294, "step": 90 }, { "epoch": 0.05925925925925926, "grad_norm": 0.5724173188209534, "learning_rate": 2e-05, "loss": 2.0369, "step": 100 }, { "epoch": 0.06518518518518518, "grad_norm": 0.6717743277549744, "learning_rate": 2.2000000000000003e-05, "loss": 2.0009, "step": 110 }, { "epoch": 0.07111111111111111, "grad_norm": 0.7047879099845886, "learning_rate": 2.4e-05, "loss": 2.1022, "step": 120 }, { "epoch": 0.07703703703703704, "grad_norm": 0.5536562204360962, "learning_rate": 2.6000000000000002e-05, "loss": 1.9473, "step": 130 }, { "epoch": 0.08296296296296296, "grad_norm": 0.7499417066574097, "learning_rate": 2.8000000000000003e-05, "loss": 1.8281, "step": 140 }, { "epoch": 0.08888888888888889, "grad_norm": 0.767043948173523, "learning_rate": 3e-05, "loss": 2.0454, "step": 150 }, { "epoch": 0.09481481481481481, "grad_norm": 0.717535138130188, "learning_rate": 3.2000000000000005e-05, "loss": 1.9561, "step": 160 }, { "epoch": 0.10074074074074074, "grad_norm": 0.6414211392402649, "learning_rate": 3.4000000000000007e-05, "loss": 1.9112, "step": 170 }, { "epoch": 0.10666666666666667, "grad_norm": 0.6495911478996277, "learning_rate": 3.6e-05, "loss": 1.96, "step": 180 }, { "epoch": 0.11259259259259259, "grad_norm": 0.6383408308029175, "learning_rate": 3.8e-05, "loss": 1.9678, "step": 190 }, { "epoch": 0.11851851851851852, "grad_norm": 0.5086091756820679, "learning_rate": 4e-05, "loss": 1.863, "step": 200 }, { "epoch": 0.12444444444444444, "grad_norm": 0.6146989464759827, "learning_rate": 4.2e-05, "loss": 1.8957, "step": 210 }, { "epoch": 0.13037037037037036, "grad_norm": 0.8208276629447937, "learning_rate": 4.4000000000000006e-05, "loss": 1.8206, "step": 220 }, { "epoch": 0.1362962962962963, "grad_norm": 0.7568028569221497, "learning_rate": 4.600000000000001e-05, "loss": 1.952, "step": 230 }, { "epoch": 0.14222222222222222, "grad_norm": 0.7779513597488403, "learning_rate": 4.8e-05, "loss": 2.037, "step": 240 }, { "epoch": 0.14814814814814814, "grad_norm": 0.6543411016464233, "learning_rate": 5e-05, "loss": 1.9939, "step": 250 }, { "epoch": 0.15407407407407409, "grad_norm": 0.6476520895957947, "learning_rate": 5.2000000000000004e-05, "loss": 1.8528, "step": 260 }, { "epoch": 0.16, "grad_norm": 0.5776610374450684, "learning_rate": 5.4000000000000005e-05, "loss": 2.0216, "step": 270 }, { "epoch": 0.16592592592592592, "grad_norm": 0.6913073658943176, "learning_rate": 5.6000000000000006e-05, "loss": 2.0332, "step": 280 }, { "epoch": 0.17185185185185184, "grad_norm": 0.7126554250717163, "learning_rate": 5.8e-05, "loss": 2.071, "step": 290 }, { "epoch": 0.17777777777777778, "grad_norm": 0.7475311756134033, "learning_rate": 6e-05, "loss": 2.0481, "step": 300 }, { "epoch": 0.1837037037037037, "grad_norm": 0.6156488656997681, "learning_rate": 6.2e-05, "loss": 2.0077, "step": 310 }, { "epoch": 0.18962962962962962, "grad_norm": 0.6974404454231262, "learning_rate": 6.400000000000001e-05, "loss": 2.0649, "step": 320 }, { "epoch": 0.19555555555555557, "grad_norm": 0.6685057282447815, "learning_rate": 6.6e-05, "loss": 2.0685, "step": 330 }, { "epoch": 0.20148148148148148, "grad_norm": 0.5366894006729126, "learning_rate": 6.800000000000001e-05, "loss": 1.8826, "step": 340 }, { "epoch": 0.2074074074074074, "grad_norm": 0.6276523470878601, "learning_rate": 7e-05, "loss": 1.9935, "step": 350 }, { "epoch": 0.21333333333333335, "grad_norm": 0.5763794779777527, "learning_rate": 7.2e-05, "loss": 1.7043, "step": 360 }, { "epoch": 0.21925925925925926, "grad_norm": 0.7299754023551941, "learning_rate": 7.4e-05, "loss": 1.9996, "step": 370 }, { "epoch": 0.22518518518518518, "grad_norm": 0.7397586107254028, "learning_rate": 7.6e-05, "loss": 2.0128, "step": 380 }, { "epoch": 0.2311111111111111, "grad_norm": 0.5862900614738464, "learning_rate": 7.800000000000001e-05, "loss": 1.8096, "step": 390 }, { "epoch": 0.23703703703703705, "grad_norm": 0.5037183165550232, "learning_rate": 8e-05, "loss": 2.0381, "step": 400 }, { "epoch": 0.24296296296296296, "grad_norm": 0.5471982359886169, "learning_rate": 8.2e-05, "loss": 1.9769, "step": 410 }, { "epoch": 0.24888888888888888, "grad_norm": 0.6831430792808533, "learning_rate": 8.4e-05, "loss": 2.0524, "step": 420 }, { "epoch": 0.2548148148148148, "grad_norm": 0.7648348808288574, "learning_rate": 8.6e-05, "loss": 2.0836, "step": 430 }, { "epoch": 0.2607407407407407, "grad_norm": 0.7717330455780029, "learning_rate": 8.800000000000001e-05, "loss": 2.0693, "step": 440 }, { "epoch": 0.26666666666666666, "grad_norm": 0.6415191292762756, "learning_rate": 9e-05, "loss": 2.0827, "step": 450 }, { "epoch": 0.2725925925925926, "grad_norm": 0.6595004796981812, "learning_rate": 9.200000000000001e-05, "loss": 1.7722, "step": 460 }, { "epoch": 0.2785185185185185, "grad_norm": 0.6670510768890381, "learning_rate": 9.4e-05, "loss": 2.0679, "step": 470 }, { "epoch": 0.28444444444444444, "grad_norm": 0.6167762875556946, "learning_rate": 9.6e-05, "loss": 2.1062, "step": 480 }, { "epoch": 0.2903703703703704, "grad_norm": 0.657123863697052, "learning_rate": 9.8e-05, "loss": 1.9496, "step": 490 }, { "epoch": 0.2962962962962963, "grad_norm": 0.7975080013275146, "learning_rate": 0.0001, "loss": 2.0326, "step": 500 }, { "epoch": 0.3022222222222222, "grad_norm": 0.5824917554855347, "learning_rate": 9.91575400168492e-05, "loss": 1.9284, "step": 510 }, { "epoch": 0.30814814814814817, "grad_norm": 0.608211874961853, "learning_rate": 9.83150800336984e-05, "loss": 1.955, "step": 520 }, { "epoch": 0.31407407407407406, "grad_norm": 0.6209816336631775, "learning_rate": 9.74726200505476e-05, "loss": 2.0276, "step": 530 }, { "epoch": 0.32, "grad_norm": 0.5495818853378296, "learning_rate": 9.66301600673968e-05, "loss": 1.9229, "step": 540 }, { "epoch": 0.32592592592592595, "grad_norm": 0.5053668022155762, "learning_rate": 9.5787700084246e-05, "loss": 1.8056, "step": 550 }, { "epoch": 0.33185185185185184, "grad_norm": 0.518039882183075, "learning_rate": 9.49452401010952e-05, "loss": 1.9308, "step": 560 }, { "epoch": 0.3377777777777778, "grad_norm": 0.6717711687088013, "learning_rate": 9.41027801179444e-05, "loss": 1.9352, "step": 570 }, { "epoch": 0.3437037037037037, "grad_norm": 0.7411617636680603, "learning_rate": 9.32603201347936e-05, "loss": 2.0059, "step": 580 }, { "epoch": 0.3496296296296296, "grad_norm": 0.48582813143730164, "learning_rate": 9.24178601516428e-05, "loss": 1.8693, "step": 590 }, { "epoch": 0.35555555555555557, "grad_norm": 0.5219349265098572, "learning_rate": 9.1575400168492e-05, "loss": 1.9478, "step": 600 }, { "epoch": 0.36148148148148146, "grad_norm": 0.6107354760169983, "learning_rate": 9.07329401853412e-05, "loss": 1.9171, "step": 610 }, { "epoch": 0.3674074074074074, "grad_norm": 0.6376306414604187, "learning_rate": 8.98904802021904e-05, "loss": 2.0132, "step": 620 }, { "epoch": 0.37333333333333335, "grad_norm": 0.5478475093841553, "learning_rate": 8.90480202190396e-05, "loss": 1.863, "step": 630 }, { "epoch": 0.37925925925925924, "grad_norm": 0.5968965291976929, "learning_rate": 8.82055602358888e-05, "loss": 1.9489, "step": 640 }, { "epoch": 0.3851851851851852, "grad_norm": 0.6038496494293213, "learning_rate": 8.736310025273799e-05, "loss": 1.7222, "step": 650 }, { "epoch": 0.39111111111111113, "grad_norm": 0.5897707343101501, "learning_rate": 8.652064026958719e-05, "loss": 1.951, "step": 660 }, { "epoch": 0.397037037037037, "grad_norm": 0.5143914222717285, "learning_rate": 8.567818028643639e-05, "loss": 2.0544, "step": 670 }, { "epoch": 0.40296296296296297, "grad_norm": 0.5237649083137512, "learning_rate": 8.483572030328559e-05, "loss": 2.002, "step": 680 }, { "epoch": 0.4088888888888889, "grad_norm": 0.6607901453971863, "learning_rate": 8.399326032013479e-05, "loss": 1.968, "step": 690 }, { "epoch": 0.4148148148148148, "grad_norm": 0.816532552242279, "learning_rate": 8.3150800336984e-05, "loss": 1.8108, "step": 700 }, { "epoch": 0.42074074074074075, "grad_norm": 0.5817467570304871, "learning_rate": 8.23083403538332e-05, "loss": 1.9755, "step": 710 }, { "epoch": 0.4266666666666667, "grad_norm": 0.5732384324073792, "learning_rate": 8.146588037068241e-05, "loss": 1.9885, "step": 720 }, { "epoch": 0.4325925925925926, "grad_norm": 0.7058317065238953, "learning_rate": 8.06234203875316e-05, "loss": 2.0017, "step": 730 }, { "epoch": 0.43851851851851853, "grad_norm": 0.589218258857727, "learning_rate": 7.97809604043808e-05, "loss": 2.094, "step": 740 }, { "epoch": 0.4444444444444444, "grad_norm": 0.4636491537094116, "learning_rate": 7.893850042123e-05, "loss": 1.8869, "step": 750 }, { "epoch": 0.45037037037037037, "grad_norm": 0.6695517301559448, "learning_rate": 7.80960404380792e-05, "loss": 1.8622, "step": 760 }, { "epoch": 0.4562962962962963, "grad_norm": 0.545862078666687, "learning_rate": 7.72535804549284e-05, "loss": 1.7885, "step": 770 }, { "epoch": 0.4622222222222222, "grad_norm": 0.7133297920227051, "learning_rate": 7.64111204717776e-05, "loss": 2.1167, "step": 780 }, { "epoch": 0.46814814814814815, "grad_norm": 0.5214343667030334, "learning_rate": 7.55686604886268e-05, "loss": 1.9065, "step": 790 }, { "epoch": 0.4740740740740741, "grad_norm": 0.46373251080513, "learning_rate": 7.4726200505476e-05, "loss": 1.8037, "step": 800 }, { "epoch": 0.48, "grad_norm": 0.646965742111206, "learning_rate": 7.388374052232519e-05, "loss": 1.7713, "step": 810 }, { "epoch": 0.48592592592592593, "grad_norm": 0.5974875092506409, "learning_rate": 7.304128053917439e-05, "loss": 1.7497, "step": 820 }, { "epoch": 0.4918518518518519, "grad_norm": 0.5363378524780273, "learning_rate": 7.219882055602359e-05, "loss": 1.8432, "step": 830 }, { "epoch": 0.49777777777777776, "grad_norm": 0.5419060587882996, "learning_rate": 7.135636057287279e-05, "loss": 1.8984, "step": 840 }, { "epoch": 0.5037037037037037, "grad_norm": 0.5365799069404602, "learning_rate": 7.0513900589722e-05, "loss": 1.9747, "step": 850 }, { "epoch": 0.5096296296296297, "grad_norm": 0.5691491365432739, "learning_rate": 6.96714406065712e-05, "loss": 2.0097, "step": 860 }, { "epoch": 0.5155555555555555, "grad_norm": 0.4764457643032074, "learning_rate": 6.88289806234204e-05, "loss": 2.0322, "step": 870 }, { "epoch": 0.5214814814814814, "grad_norm": 0.532008707523346, "learning_rate": 6.79865206402696e-05, "loss": 1.8917, "step": 880 }, { "epoch": 0.5274074074074074, "grad_norm": 0.6209381818771362, "learning_rate": 6.714406065711879e-05, "loss": 1.9184, "step": 890 }, { "epoch": 0.5333333333333333, "grad_norm": 0.5349578857421875, "learning_rate": 6.630160067396799e-05, "loss": 1.8408, "step": 900 }, { "epoch": 0.5392592592592592, "grad_norm": 0.5553748607635498, "learning_rate": 6.545914069081719e-05, "loss": 2.0037, "step": 910 }, { "epoch": 0.5451851851851852, "grad_norm": 0.5438774228096008, "learning_rate": 6.461668070766639e-05, "loss": 1.9709, "step": 920 }, { "epoch": 0.5511111111111111, "grad_norm": 0.5479866862297058, "learning_rate": 6.377422072451559e-05, "loss": 2.0121, "step": 930 }, { "epoch": 0.557037037037037, "grad_norm": 0.479965478181839, "learning_rate": 6.293176074136478e-05, "loss": 1.7643, "step": 940 }, { "epoch": 0.562962962962963, "grad_norm": 0.41314756870269775, "learning_rate": 6.208930075821398e-05, "loss": 1.9746, "step": 950 }, { "epoch": 0.5688888888888889, "grad_norm": 0.5965597629547119, "learning_rate": 6.124684077506318e-05, "loss": 1.9464, "step": 960 }, { "epoch": 0.5748148148148148, "grad_norm": 0.5767338275909424, "learning_rate": 6.0404380791912386e-05, "loss": 1.9955, "step": 970 }, { "epoch": 0.5807407407407408, "grad_norm": 0.5822139978408813, "learning_rate": 5.9561920808761584e-05, "loss": 1.9824, "step": 980 }, { "epoch": 0.5866666666666667, "grad_norm": 0.5352127552032471, "learning_rate": 5.871946082561078e-05, "loss": 2.0282, "step": 990 }, { "epoch": 0.5925925925925926, "grad_norm": 0.4747357964515686, "learning_rate": 5.787700084245998e-05, "loss": 1.9458, "step": 1000 }, { "epoch": 0.5985185185185186, "grad_norm": 0.6914563775062561, "learning_rate": 5.703454085930918e-05, "loss": 1.8725, "step": 1010 }, { "epoch": 0.6044444444444445, "grad_norm": 0.5494988560676575, "learning_rate": 5.6192080876158384e-05, "loss": 1.986, "step": 1020 }, { "epoch": 0.6103703703703703, "grad_norm": 0.5098171830177307, "learning_rate": 5.534962089300758e-05, "loss": 1.8683, "step": 1030 }, { "epoch": 0.6162962962962963, "grad_norm": 0.6599770784378052, "learning_rate": 5.450716090985678e-05, "loss": 1.7654, "step": 1040 }, { "epoch": 0.6222222222222222, "grad_norm": 0.6061760783195496, "learning_rate": 5.366470092670598e-05, "loss": 1.9257, "step": 1050 }, { "epoch": 0.6281481481481481, "grad_norm": 0.4598886966705322, "learning_rate": 5.282224094355518e-05, "loss": 1.8542, "step": 1060 }, { "epoch": 0.6340740740740741, "grad_norm": 0.7132225632667542, "learning_rate": 5.1979780960404376e-05, "loss": 1.878, "step": 1070 }, { "epoch": 0.64, "grad_norm": 0.47084617614746094, "learning_rate": 5.113732097725358e-05, "loss": 1.785, "step": 1080 }, { "epoch": 0.6459259259259259, "grad_norm": 0.4628468453884125, "learning_rate": 5.029486099410278e-05, "loss": 1.6897, "step": 1090 }, { "epoch": 0.6518518518518519, "grad_norm": 0.4679886996746063, "learning_rate": 4.9452401010951984e-05, "loss": 1.7765, "step": 1100 }, { "epoch": 0.6577777777777778, "grad_norm": 0.6136245727539062, "learning_rate": 4.860994102780118e-05, "loss": 1.7136, "step": 1110 }, { "epoch": 0.6637037037037037, "grad_norm": 0.5701176524162292, "learning_rate": 4.776748104465038e-05, "loss": 1.9895, "step": 1120 }, { "epoch": 0.6696296296296296, "grad_norm": 0.5996720194816589, "learning_rate": 4.692502106149958e-05, "loss": 1.9563, "step": 1130 }, { "epoch": 0.6755555555555556, "grad_norm": 0.5536661744117737, "learning_rate": 4.608256107834878e-05, "loss": 1.8388, "step": 1140 }, { "epoch": 0.6814814814814815, "grad_norm": 0.47468143701553345, "learning_rate": 4.5240101095197975e-05, "loss": 1.8644, "step": 1150 }, { "epoch": 0.6874074074074074, "grad_norm": 0.5488613843917847, "learning_rate": 4.439764111204718e-05, "loss": 1.8448, "step": 1160 }, { "epoch": 0.6933333333333334, "grad_norm": 0.5627337098121643, "learning_rate": 4.355518112889638e-05, "loss": 1.8129, "step": 1170 }, { "epoch": 0.6992592592592592, "grad_norm": 0.5869198441505432, "learning_rate": 4.271272114574558e-05, "loss": 1.8029, "step": 1180 }, { "epoch": 0.7051851851851851, "grad_norm": 0.5446140170097351, "learning_rate": 4.1870261162594775e-05, "loss": 1.7783, "step": 1190 }, { "epoch": 0.7111111111111111, "grad_norm": 0.6113932728767395, "learning_rate": 4.102780117944398e-05, "loss": 1.7782, "step": 1200 }, { "epoch": 0.717037037037037, "grad_norm": 0.49301135540008545, "learning_rate": 4.018534119629318e-05, "loss": 1.7719, "step": 1210 }, { "epoch": 0.7229629629629629, "grad_norm": 0.6062334179878235, "learning_rate": 3.934288121314238e-05, "loss": 1.8681, "step": 1220 }, { "epoch": 0.7288888888888889, "grad_norm": 0.4924413561820984, "learning_rate": 3.850042122999158e-05, "loss": 1.7276, "step": 1230 }, { "epoch": 0.7348148148148148, "grad_norm": 0.5645489692687988, "learning_rate": 3.765796124684078e-05, "loss": 1.8095, "step": 1240 }, { "epoch": 0.7407407407407407, "grad_norm": 0.5494862198829651, "learning_rate": 3.681550126368998e-05, "loss": 1.9272, "step": 1250 }, { "epoch": 0.7466666666666667, "grad_norm": 0.45363616943359375, "learning_rate": 3.597304128053918e-05, "loss": 1.6991, "step": 1260 }, { "epoch": 0.7525925925925926, "grad_norm": 0.5172736644744873, "learning_rate": 3.5130581297388375e-05, "loss": 1.6942, "step": 1270 }, { "epoch": 0.7585185185185185, "grad_norm": 0.5237290859222412, "learning_rate": 3.4288121314237574e-05, "loss": 1.834, "step": 1280 }, { "epoch": 0.7644444444444445, "grad_norm": 0.685662031173706, "learning_rate": 3.344566133108677e-05, "loss": 1.6324, "step": 1290 }, { "epoch": 0.7703703703703704, "grad_norm": 0.5649334192276001, "learning_rate": 3.260320134793598e-05, "loss": 1.9939, "step": 1300 }, { "epoch": 0.7762962962962963, "grad_norm": 0.5103081464767456, "learning_rate": 3.1760741364785175e-05, "loss": 1.7158, "step": 1310 }, { "epoch": 0.7822222222222223, "grad_norm": 0.5990691781044006, "learning_rate": 3.0918281381634374e-05, "loss": 1.8249, "step": 1320 }, { "epoch": 0.7881481481481482, "grad_norm": 0.4889729917049408, "learning_rate": 3.0075821398483572e-05, "loss": 2.0304, "step": 1330 }, { "epoch": 0.794074074074074, "grad_norm": 0.5077411532402039, "learning_rate": 2.923336141533277e-05, "loss": 1.8993, "step": 1340 }, { "epoch": 0.8, "grad_norm": 0.4161909818649292, "learning_rate": 2.8390901432181972e-05, "loss": 1.8544, "step": 1350 }, { "epoch": 0.8059259259259259, "grad_norm": 0.3950774669647217, "learning_rate": 2.754844144903117e-05, "loss": 1.9056, "step": 1360 }, { "epoch": 0.8118518518518518, "grad_norm": 0.4794867932796478, "learning_rate": 2.670598146588037e-05, "loss": 1.9091, "step": 1370 }, { "epoch": 0.8177777777777778, "grad_norm": 0.5798467397689819, "learning_rate": 2.586352148272957e-05, "loss": 1.7677, "step": 1380 }, { "epoch": 0.8237037037037037, "grad_norm": 0.42939475178718567, "learning_rate": 2.502106149957877e-05, "loss": 1.4855, "step": 1390 }, { "epoch": 0.8296296296296296, "grad_norm": 0.5029926300048828, "learning_rate": 2.417860151642797e-05, "loss": 1.7523, "step": 1400 }, { "epoch": 0.8355555555555556, "grad_norm": 0.5045111775398254, "learning_rate": 2.333614153327717e-05, "loss": 1.7036, "step": 1410 }, { "epoch": 0.8414814814814815, "grad_norm": 0.45949357748031616, "learning_rate": 2.249368155012637e-05, "loss": 1.7477, "step": 1420 }, { "epoch": 0.8474074074074074, "grad_norm": 0.6068927645683289, "learning_rate": 2.165122156697557e-05, "loss": 1.7256, "step": 1430 }, { "epoch": 0.8533333333333334, "grad_norm": 0.47375020384788513, "learning_rate": 2.080876158382477e-05, "loss": 1.6821, "step": 1440 }, { "epoch": 0.8592592592592593, "grad_norm": 0.6090928316116333, "learning_rate": 1.996630160067397e-05, "loss": 1.7821, "step": 1450 }, { "epoch": 0.8651851851851852, "grad_norm": 0.4479965567588806, "learning_rate": 1.912384161752317e-05, "loss": 1.7752, "step": 1460 }, { "epoch": 0.8711111111111111, "grad_norm": 0.502055287361145, "learning_rate": 1.828138163437237e-05, "loss": 1.7502, "step": 1470 }, { "epoch": 0.8770370370370371, "grad_norm": 0.5576585531234741, "learning_rate": 1.7438921651221567e-05, "loss": 1.7943, "step": 1480 }, { "epoch": 0.882962962962963, "grad_norm": 0.443153977394104, "learning_rate": 1.659646166807077e-05, "loss": 1.7002, "step": 1490 }, { "epoch": 0.8888888888888888, "grad_norm": 0.5595043301582336, "learning_rate": 1.5754001684919967e-05, "loss": 1.8032, "step": 1500 }, { "epoch": 0.8948148148148148, "grad_norm": 0.5037031769752502, "learning_rate": 1.4911541701769167e-05, "loss": 1.9339, "step": 1510 }, { "epoch": 0.9007407407407407, "grad_norm": 0.592262327671051, "learning_rate": 1.4069081718618365e-05, "loss": 1.7636, "step": 1520 }, { "epoch": 0.9066666666666666, "grad_norm": 0.48528340458869934, "learning_rate": 1.3226621735467565e-05, "loss": 1.5757, "step": 1530 }, { "epoch": 0.9125925925925926, "grad_norm": 0.5274297595024109, "learning_rate": 1.2384161752316765e-05, "loss": 1.7721, "step": 1540 }, { "epoch": 0.9185185185185185, "grad_norm": 0.5855200886726379, "learning_rate": 1.1541701769165965e-05, "loss": 1.8039, "step": 1550 }, { "epoch": 0.9244444444444444, "grad_norm": 0.46873489022254944, "learning_rate": 1.0699241786015165e-05, "loss": 1.7628, "step": 1560 }, { "epoch": 0.9303703703703704, "grad_norm": 0.4692378044128418, "learning_rate": 9.856781802864365e-06, "loss": 1.6898, "step": 1570 }, { "epoch": 0.9362962962962963, "grad_norm": 0.4950474500656128, "learning_rate": 9.014321819713565e-06, "loss": 1.882, "step": 1580 }, { "epoch": 0.9422222222222222, "grad_norm": 0.5897673964500427, "learning_rate": 8.171861836562763e-06, "loss": 1.8277, "step": 1590 }, { "epoch": 0.9481481481481482, "grad_norm": 0.5677273273468018, "learning_rate": 7.329401853411963e-06, "loss": 1.771, "step": 1600 }, { "epoch": 0.9540740740740741, "grad_norm": 1.664191484451294, "learning_rate": 6.4869418702611624e-06, "loss": 1.7564, "step": 1610 }, { "epoch": 0.96, "grad_norm": 0.45749393105506897, "learning_rate": 5.644481887110362e-06, "loss": 1.7552, "step": 1620 }, { "epoch": 0.965925925925926, "grad_norm": 0.6322038769721985, "learning_rate": 4.802021903959562e-06, "loss": 2.0075, "step": 1630 }, { "epoch": 0.9718518518518519, "grad_norm": 0.5579991340637207, "learning_rate": 3.9595619208087616e-06, "loss": 1.8782, "step": 1640 }, { "epoch": 0.9777777777777777, "grad_norm": 0.48121675848960876, "learning_rate": 3.1171019376579616e-06, "loss": 1.7393, "step": 1650 }, { "epoch": 0.9837037037037037, "grad_norm": 0.5826757550239563, "learning_rate": 2.274641954507161e-06, "loss": 2.0148, "step": 1660 }, { "epoch": 0.9896296296296296, "grad_norm": 0.513190746307373, "learning_rate": 1.4321819713563607e-06, "loss": 1.6227, "step": 1670 }, { "epoch": 0.9955555555555555, "grad_norm": 0.5289293527603149, "learning_rate": 5.897219882055603e-07, "loss": 1.7117, "step": 1680 } ], "logging_steps": 10, "max_steps": 1687, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.639446638618214e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }