{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9954430379746837, "eval_steps": 500, "global_step": 1479, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020253164556962026, "grad_norm": 4.742581199201631, "learning_rate": 6.756756756756758e-07, "loss": 0.8238, "step": 10 }, { "epoch": 0.04050632911392405, "grad_norm": 2.7089177182440225, "learning_rate": 1.3513513513513515e-06, "loss": 0.7256, "step": 20 }, { "epoch": 0.060759493670886074, "grad_norm": 1.6438330526897271, "learning_rate": 2.0270270270270273e-06, "loss": 0.6705, "step": 30 }, { "epoch": 0.0810126582278481, "grad_norm": 1.9649059006708485, "learning_rate": 2.702702702702703e-06, "loss": 0.6382, "step": 40 }, { "epoch": 0.10126582278481013, "grad_norm": 1.8319812554661556, "learning_rate": 3.3783783783783788e-06, "loss": 0.62, "step": 50 }, { "epoch": 0.12151898734177215, "grad_norm": 1.8563981143868757, "learning_rate": 4.0540540540540545e-06, "loss": 0.6097, "step": 60 }, { "epoch": 0.14177215189873418, "grad_norm": 2.0369557567075196, "learning_rate": 4.72972972972973e-06, "loss": 0.6039, "step": 70 }, { "epoch": 0.1620253164556962, "grad_norm": 3.933193428286484, "learning_rate": 4.978647686832741e-06, "loss": 0.5981, "step": 80 }, { "epoch": 0.18227848101265823, "grad_norm": 3.961786031844899, "learning_rate": 4.943060498220641e-06, "loss": 0.5957, "step": 90 }, { "epoch": 0.20253164556962025, "grad_norm": 3.016653386179453, "learning_rate": 4.907473309608541e-06, "loss": 0.5966, "step": 100 }, { "epoch": 0.22278481012658227, "grad_norm": 2.0911611009486664, "learning_rate": 4.8718861209964415e-06, "loss": 0.5874, "step": 110 }, { "epoch": 0.2430379746835443, "grad_norm": 2.3577222635919877, "learning_rate": 4.836298932384342e-06, "loss": 0.5851, "step": 120 }, { "epoch": 0.26329113924050634, "grad_norm": 2.4189498060173826, "learning_rate": 4.800711743772242e-06, "loss": 0.5839, "step": 130 }, { "epoch": 0.28354430379746837, "grad_norm": 1.9667579738688012, "learning_rate": 4.765124555160142e-06, "loss": 0.573, "step": 140 }, { "epoch": 0.3037974683544304, "grad_norm": 1.369959691403867, "learning_rate": 4.7295373665480435e-06, "loss": 0.5838, "step": 150 }, { "epoch": 0.3240506329113924, "grad_norm": 1.4080935903120366, "learning_rate": 4.693950177935944e-06, "loss": 0.5826, "step": 160 }, { "epoch": 0.34430379746835443, "grad_norm": 1.375777676837261, "learning_rate": 4.658362989323843e-06, "loss": 0.5827, "step": 170 }, { "epoch": 0.36455696202531646, "grad_norm": 1.6546345406257401, "learning_rate": 4.622775800711744e-06, "loss": 0.5783, "step": 180 }, { "epoch": 0.3848101265822785, "grad_norm": 2.138184576302769, "learning_rate": 4.587188612099645e-06, "loss": 0.578, "step": 190 }, { "epoch": 0.4050632911392405, "grad_norm": 1.5042114599648924, "learning_rate": 4.551601423487545e-06, "loss": 0.5713, "step": 200 }, { "epoch": 0.4253164556962025, "grad_norm": 2.34307337846717, "learning_rate": 4.516014234875445e-06, "loss": 0.5659, "step": 210 }, { "epoch": 0.44556962025316454, "grad_norm": 2.2965660467623983, "learning_rate": 4.4804270462633455e-06, "loss": 0.5711, "step": 220 }, { "epoch": 0.46582278481012657, "grad_norm": 2.2958689571859363, "learning_rate": 4.444839857651246e-06, "loss": 0.5693, "step": 230 }, { "epoch": 0.4860759493670886, "grad_norm": 2.417004138332344, "learning_rate": 4.409252669039146e-06, "loss": 0.5646, "step": 240 }, { "epoch": 0.5063291139240507, "grad_norm": 2.12941087663038, "learning_rate": 4.373665480427046e-06, "loss": 0.5681, "step": 250 }, { "epoch": 0.5265822784810127, "grad_norm": 1.8555366173965417, "learning_rate": 4.3380782918149475e-06, "loss": 0.5543, "step": 260 }, { "epoch": 0.5468354430379747, "grad_norm": 1.5852520856985666, "learning_rate": 4.302491103202847e-06, "loss": 0.5639, "step": 270 }, { "epoch": 0.5670886075949367, "grad_norm": 1.460016106438118, "learning_rate": 4.266903914590747e-06, "loss": 0.5593, "step": 280 }, { "epoch": 0.5873417721518988, "grad_norm": 1.7121558558722028, "learning_rate": 4.231316725978648e-06, "loss": 0.5608, "step": 290 }, { "epoch": 0.6075949367088608, "grad_norm": 1.3067137647800322, "learning_rate": 4.195729537366549e-06, "loss": 0.5607, "step": 300 }, { "epoch": 0.6278481012658228, "grad_norm": 1.418773706013845, "learning_rate": 4.160142348754449e-06, "loss": 0.5666, "step": 310 }, { "epoch": 0.6481012658227848, "grad_norm": 1.4267499965709702, "learning_rate": 4.124555160142349e-06, "loss": 0.5629, "step": 320 }, { "epoch": 0.6683544303797468, "grad_norm": 1.3749578870885502, "learning_rate": 4.0889679715302495e-06, "loss": 0.5619, "step": 330 }, { "epoch": 0.6886075949367089, "grad_norm": 1.275893799768224, "learning_rate": 4.05338078291815e-06, "loss": 0.5611, "step": 340 }, { "epoch": 0.7088607594936709, "grad_norm": 1.2613374964537525, "learning_rate": 4.01779359430605e-06, "loss": 0.5625, "step": 350 }, { "epoch": 0.7291139240506329, "grad_norm": 1.282128305662025, "learning_rate": 3.98220640569395e-06, "loss": 0.5647, "step": 360 }, { "epoch": 0.7493670886075949, "grad_norm": 1.2183620482413273, "learning_rate": 3.946619217081851e-06, "loss": 0.557, "step": 370 }, { "epoch": 0.769620253164557, "grad_norm": 1.1816403948954313, "learning_rate": 3.911032028469751e-06, "loss": 0.5535, "step": 380 }, { "epoch": 0.789873417721519, "grad_norm": 1.343065511383417, "learning_rate": 3.875444839857651e-06, "loss": 0.5564, "step": 390 }, { "epoch": 0.810126582278481, "grad_norm": 1.1056411836705247, "learning_rate": 3.839857651245552e-06, "loss": 0.5562, "step": 400 }, { "epoch": 0.830379746835443, "grad_norm": 1.1617514638384954, "learning_rate": 3.804270462633452e-06, "loss": 0.5509, "step": 410 }, { "epoch": 0.850632911392405, "grad_norm": 1.2456514736887305, "learning_rate": 3.7686832740213525e-06, "loss": 0.5589, "step": 420 }, { "epoch": 0.8708860759493671, "grad_norm": 1.0959898492388527, "learning_rate": 3.733096085409253e-06, "loss": 0.5521, "step": 430 }, { "epoch": 0.8911392405063291, "grad_norm": 1.295665638247925, "learning_rate": 3.6975088967971536e-06, "loss": 0.5572, "step": 440 }, { "epoch": 0.9113924050632911, "grad_norm": 1.0535123677725715, "learning_rate": 3.661921708185054e-06, "loss": 0.5482, "step": 450 }, { "epoch": 0.9316455696202531, "grad_norm": 1.0683406426734499, "learning_rate": 3.6263345195729537e-06, "loss": 0.5525, "step": 460 }, { "epoch": 0.9518987341772152, "grad_norm": 1.253383608055468, "learning_rate": 3.5907473309608544e-06, "loss": 0.5533, "step": 470 }, { "epoch": 0.9721518987341772, "grad_norm": 1.2809747280599841, "learning_rate": 3.5551601423487547e-06, "loss": 0.5505, "step": 480 }, { "epoch": 0.9924050632911392, "grad_norm": 1.1325843807943288, "learning_rate": 3.519572953736655e-06, "loss": 0.5465, "step": 490 }, { "epoch": 0.9984810126582279, "eval_loss": 0.068679079413414, "eval_runtime": 507.7278, "eval_samples_per_second": 26.203, "eval_steps_per_second": 0.41, "step": 493 }, { "epoch": 1.0126582278481013, "grad_norm": 1.7325407692218586, "learning_rate": 3.4839857651245557e-06, "loss": 0.5125, "step": 500 }, { "epoch": 1.0329113924050632, "grad_norm": 1.9559729603669886, "learning_rate": 3.4483985765124556e-06, "loss": 0.4812, "step": 510 }, { "epoch": 1.0531645569620254, "grad_norm": 1.4932963289367234, "learning_rate": 3.412811387900356e-06, "loss": 0.4774, "step": 520 }, { "epoch": 1.0734177215189873, "grad_norm": 1.185384048798669, "learning_rate": 3.3772241992882566e-06, "loss": 0.4785, "step": 530 }, { "epoch": 1.0936708860759494, "grad_norm": 1.1972600486999827, "learning_rate": 3.341637010676157e-06, "loss": 0.4758, "step": 540 }, { "epoch": 1.1139240506329113, "grad_norm": 1.1460022054980692, "learning_rate": 3.3060498220640576e-06, "loss": 0.4778, "step": 550 }, { "epoch": 1.1341772151898735, "grad_norm": 1.1197932329811855, "learning_rate": 3.2704626334519575e-06, "loss": 0.4752, "step": 560 }, { "epoch": 1.1544303797468354, "grad_norm": 1.194566549573746, "learning_rate": 3.2348754448398577e-06, "loss": 0.4818, "step": 570 }, { "epoch": 1.1746835443037975, "grad_norm": 1.2784411634842332, "learning_rate": 3.1992882562277585e-06, "loss": 0.4798, "step": 580 }, { "epoch": 1.1949367088607594, "grad_norm": 1.123327911933, "learning_rate": 3.1637010676156587e-06, "loss": 0.4736, "step": 590 }, { "epoch": 1.2151898734177216, "grad_norm": 1.193089413979018, "learning_rate": 3.128113879003559e-06, "loss": 0.4804, "step": 600 }, { "epoch": 1.2354430379746835, "grad_norm": 1.1785101089717391, "learning_rate": 3.0925266903914593e-06, "loss": 0.4762, "step": 610 }, { "epoch": 1.2556962025316456, "grad_norm": 1.2080168650481182, "learning_rate": 3.0569395017793596e-06, "loss": 0.4798, "step": 620 }, { "epoch": 1.2759493670886077, "grad_norm": 1.3205102063888547, "learning_rate": 3.02135231316726e-06, "loss": 0.4796, "step": 630 }, { "epoch": 1.2962025316455696, "grad_norm": 1.3147319714200894, "learning_rate": 2.9857651245551606e-06, "loss": 0.4787, "step": 640 }, { "epoch": 1.3164556962025316, "grad_norm": 1.1490461889736139, "learning_rate": 2.950177935943061e-06, "loss": 0.482, "step": 650 }, { "epoch": 1.3367088607594937, "grad_norm": 1.3228852312739667, "learning_rate": 2.9145907473309608e-06, "loss": 0.4705, "step": 660 }, { "epoch": 1.3569620253164558, "grad_norm": 1.2638864667189769, "learning_rate": 2.8790035587188615e-06, "loss": 0.4839, "step": 670 }, { "epoch": 1.3772151898734177, "grad_norm": 1.1378222725461256, "learning_rate": 2.8434163701067618e-06, "loss": 0.4807, "step": 680 }, { "epoch": 1.3974683544303796, "grad_norm": 1.136931180029992, "learning_rate": 2.8078291814946625e-06, "loss": 0.4793, "step": 690 }, { "epoch": 1.4177215189873418, "grad_norm": 1.2382582028693814, "learning_rate": 2.7722419928825624e-06, "loss": 0.476, "step": 700 }, { "epoch": 1.437974683544304, "grad_norm": 1.2304522818128008, "learning_rate": 2.7366548042704626e-06, "loss": 0.4813, "step": 710 }, { "epoch": 1.4582278481012658, "grad_norm": 1.123509723565589, "learning_rate": 2.7010676156583634e-06, "loss": 0.4809, "step": 720 }, { "epoch": 1.4784810126582277, "grad_norm": 1.1387958282357735, "learning_rate": 2.6654804270462636e-06, "loss": 0.4833, "step": 730 }, { "epoch": 1.4987341772151899, "grad_norm": 1.118649730450664, "learning_rate": 2.629893238434164e-06, "loss": 0.4758, "step": 740 }, { "epoch": 1.518987341772152, "grad_norm": 1.222896592137077, "learning_rate": 2.5943060498220642e-06, "loss": 0.4779, "step": 750 }, { "epoch": 1.539240506329114, "grad_norm": 1.4074398237958103, "learning_rate": 2.5587188612099645e-06, "loss": 0.4788, "step": 760 }, { "epoch": 1.5594936708860758, "grad_norm": 1.1619300400485018, "learning_rate": 2.523131672597865e-06, "loss": 0.4772, "step": 770 }, { "epoch": 1.579746835443038, "grad_norm": 1.422184034548514, "learning_rate": 2.4875444839857655e-06, "loss": 0.4778, "step": 780 }, { "epoch": 1.6, "grad_norm": 1.2234336626816806, "learning_rate": 2.451957295373666e-06, "loss": 0.4792, "step": 790 }, { "epoch": 1.620253164556962, "grad_norm": 1.1332543250276066, "learning_rate": 2.416370106761566e-06, "loss": 0.4753, "step": 800 }, { "epoch": 1.640506329113924, "grad_norm": 1.2069589755433405, "learning_rate": 2.3807829181494664e-06, "loss": 0.4683, "step": 810 }, { "epoch": 1.660759493670886, "grad_norm": 1.106986539594195, "learning_rate": 2.3451957295373667e-06, "loss": 0.4773, "step": 820 }, { "epoch": 1.6810126582278482, "grad_norm": 1.1080989558058947, "learning_rate": 2.3096085409252674e-06, "loss": 0.472, "step": 830 }, { "epoch": 1.70126582278481, "grad_norm": 1.1905792829948112, "learning_rate": 2.2740213523131673e-06, "loss": 0.4734, "step": 840 }, { "epoch": 1.721518987341772, "grad_norm": 1.1263868733328295, "learning_rate": 2.238434163701068e-06, "loss": 0.4755, "step": 850 }, { "epoch": 1.7417721518987341, "grad_norm": 1.033205957803366, "learning_rate": 2.2028469750889683e-06, "loss": 0.475, "step": 860 }, { "epoch": 1.7620253164556963, "grad_norm": 1.127189794873212, "learning_rate": 2.1672597864768685e-06, "loss": 0.4785, "step": 870 }, { "epoch": 1.7822784810126582, "grad_norm": 1.1788992895844184, "learning_rate": 2.131672597864769e-06, "loss": 0.479, "step": 880 }, { "epoch": 1.80253164556962, "grad_norm": 1.186766190893949, "learning_rate": 2.096085409252669e-06, "loss": 0.4763, "step": 890 }, { "epoch": 1.8227848101265822, "grad_norm": 1.1866935971232218, "learning_rate": 2.06049822064057e-06, "loss": 0.4751, "step": 900 }, { "epoch": 1.8430379746835444, "grad_norm": 1.2254733687693131, "learning_rate": 2.0249110320284697e-06, "loss": 0.4767, "step": 910 }, { "epoch": 1.8632911392405065, "grad_norm": 1.0999643877142318, "learning_rate": 1.9893238434163704e-06, "loss": 0.475, "step": 920 }, { "epoch": 1.8835443037974684, "grad_norm": 1.1191194093430747, "learning_rate": 1.9537366548042703e-06, "loss": 0.4714, "step": 930 }, { "epoch": 1.9037974683544303, "grad_norm": 1.1342720462254603, "learning_rate": 1.918149466192171e-06, "loss": 0.4751, "step": 940 }, { "epoch": 1.9240506329113924, "grad_norm": 1.0241324911004035, "learning_rate": 1.8825622775800715e-06, "loss": 0.4759, "step": 950 }, { "epoch": 1.9443037974683546, "grad_norm": 1.1373301367884234, "learning_rate": 1.8469750889679716e-06, "loss": 0.4795, "step": 960 }, { "epoch": 1.9645569620253165, "grad_norm": 1.047662657429949, "learning_rate": 1.811387900355872e-06, "loss": 0.4778, "step": 970 }, { "epoch": 1.9848101265822784, "grad_norm": 1.0623876456662695, "learning_rate": 1.7758007117437724e-06, "loss": 0.4778, "step": 980 }, { "epoch": 1.998987341772152, "eval_loss": 0.06775479018688202, "eval_runtime": 509.7296, "eval_samples_per_second": 26.1, "eval_steps_per_second": 0.408, "step": 987 }, { "epoch": 2.0050632911392405, "grad_norm": 1.885410249625527, "learning_rate": 1.7402135231316727e-06, "loss": 0.456, "step": 990 }, { "epoch": 2.0253164556962027, "grad_norm": 1.4856083185892484, "learning_rate": 1.7046263345195732e-06, "loss": 0.4107, "step": 1000 }, { "epoch": 2.0455696202531644, "grad_norm": 1.309133994163287, "learning_rate": 1.6690391459074735e-06, "loss": 0.4093, "step": 1010 }, { "epoch": 2.0658227848101265, "grad_norm": 1.1938535418758685, "learning_rate": 1.633451957295374e-06, "loss": 0.4054, "step": 1020 }, { "epoch": 2.0860759493670886, "grad_norm": 1.107962295104452, "learning_rate": 1.597864768683274e-06, "loss": 0.4053, "step": 1030 }, { "epoch": 2.1063291139240508, "grad_norm": 1.2052202862963048, "learning_rate": 1.5622775800711745e-06, "loss": 0.4024, "step": 1040 }, { "epoch": 2.1265822784810124, "grad_norm": 1.1688527474064736, "learning_rate": 1.526690391459075e-06, "loss": 0.411, "step": 1050 }, { "epoch": 2.1468354430379746, "grad_norm": 1.159582820630888, "learning_rate": 1.4911032028469751e-06, "loss": 0.4056, "step": 1060 }, { "epoch": 2.1670886075949367, "grad_norm": 1.1801911147741104, "learning_rate": 1.4555160142348756e-06, "loss": 0.4079, "step": 1070 }, { "epoch": 2.187341772151899, "grad_norm": 1.1866560972006943, "learning_rate": 1.419928825622776e-06, "loss": 0.4061, "step": 1080 }, { "epoch": 2.207594936708861, "grad_norm": 1.1271721136927093, "learning_rate": 1.3843416370106764e-06, "loss": 0.4035, "step": 1090 }, { "epoch": 2.2278481012658227, "grad_norm": 1.148968106587725, "learning_rate": 1.3487544483985765e-06, "loss": 0.412, "step": 1100 }, { "epoch": 2.248101265822785, "grad_norm": 1.17444595906901, "learning_rate": 1.313167259786477e-06, "loss": 0.4078, "step": 1110 }, { "epoch": 2.268354430379747, "grad_norm": 1.241550247309238, "learning_rate": 1.2775800711743775e-06, "loss": 0.4091, "step": 1120 }, { "epoch": 2.2886075949367086, "grad_norm": 1.1322921512574697, "learning_rate": 1.2419928825622776e-06, "loss": 0.41, "step": 1130 }, { "epoch": 2.3088607594936708, "grad_norm": 1.1038688400048324, "learning_rate": 1.206405693950178e-06, "loss": 0.4052, "step": 1140 }, { "epoch": 2.329113924050633, "grad_norm": 1.1471785710291027, "learning_rate": 1.1708185053380784e-06, "loss": 0.4116, "step": 1150 }, { "epoch": 2.349367088607595, "grad_norm": 1.1748513395840992, "learning_rate": 1.1352313167259789e-06, "loss": 0.4104, "step": 1160 }, { "epoch": 2.369620253164557, "grad_norm": 1.1687832541778012, "learning_rate": 1.0996441281138791e-06, "loss": 0.4103, "step": 1170 }, { "epoch": 2.389873417721519, "grad_norm": 1.2153707569716299, "learning_rate": 1.0640569395017794e-06, "loss": 0.4102, "step": 1180 }, { "epoch": 2.410126582278481, "grad_norm": 1.2043915036892876, "learning_rate": 1.0284697508896797e-06, "loss": 0.4093, "step": 1190 }, { "epoch": 2.430379746835443, "grad_norm": 1.1154060559605754, "learning_rate": 9.928825622775802e-07, "loss": 0.4077, "step": 1200 }, { "epoch": 2.4506329113924052, "grad_norm": 1.1249500649551158, "learning_rate": 9.572953736654805e-07, "loss": 0.4067, "step": 1210 }, { "epoch": 2.470886075949367, "grad_norm": 1.1373080270061644, "learning_rate": 9.217081850533809e-07, "loss": 0.4094, "step": 1220 }, { "epoch": 2.491139240506329, "grad_norm": 1.2009940834893456, "learning_rate": 8.861209964412812e-07, "loss": 0.4081, "step": 1230 }, { "epoch": 2.511392405063291, "grad_norm": 1.1805699402158027, "learning_rate": 8.505338078291815e-07, "loss": 0.412, "step": 1240 }, { "epoch": 2.5316455696202533, "grad_norm": 1.1650230538613433, "learning_rate": 8.14946619217082e-07, "loss": 0.4053, "step": 1250 }, { "epoch": 2.5518987341772155, "grad_norm": 1.1148652618881494, "learning_rate": 7.793594306049823e-07, "loss": 0.4003, "step": 1260 }, { "epoch": 2.572151898734177, "grad_norm": 1.1203790714411743, "learning_rate": 7.437722419928827e-07, "loss": 0.4071, "step": 1270 }, { "epoch": 2.5924050632911393, "grad_norm": 1.1540738131106723, "learning_rate": 7.08185053380783e-07, "loss": 0.4088, "step": 1280 }, { "epoch": 2.6126582278481014, "grad_norm": 1.1564560350860578, "learning_rate": 6.725978647686833e-07, "loss": 0.4136, "step": 1290 }, { "epoch": 2.632911392405063, "grad_norm": 1.1502008333731566, "learning_rate": 6.370106761565837e-07, "loss": 0.4074, "step": 1300 }, { "epoch": 2.6531645569620252, "grad_norm": 1.084342294241873, "learning_rate": 6.01423487544484e-07, "loss": 0.4068, "step": 1310 }, { "epoch": 2.6734177215189874, "grad_norm": 1.0975552769986037, "learning_rate": 5.658362989323843e-07, "loss": 0.408, "step": 1320 }, { "epoch": 2.6936708860759495, "grad_norm": 1.1097418341903917, "learning_rate": 5.302491103202847e-07, "loss": 0.4103, "step": 1330 }, { "epoch": 2.7139240506329116, "grad_norm": 1.158009570288043, "learning_rate": 4.946619217081851e-07, "loss": 0.4086, "step": 1340 }, { "epoch": 2.7341772151898733, "grad_norm": 1.082032638132166, "learning_rate": 4.5907473309608547e-07, "loss": 0.4067, "step": 1350 }, { "epoch": 2.7544303797468355, "grad_norm": 1.0910706188001162, "learning_rate": 4.234875444839858e-07, "loss": 0.4025, "step": 1360 }, { "epoch": 2.7746835443037976, "grad_norm": 1.1340308917418294, "learning_rate": 3.8790035587188615e-07, "loss": 0.4074, "step": 1370 }, { "epoch": 2.7949367088607593, "grad_norm": 1.0972217494408671, "learning_rate": 3.5231316725978655e-07, "loss": 0.4059, "step": 1380 }, { "epoch": 2.8151898734177214, "grad_norm": 1.1023421234405986, "learning_rate": 3.1672597864768684e-07, "loss": 0.408, "step": 1390 }, { "epoch": 2.8354430379746836, "grad_norm": 1.076348844263821, "learning_rate": 2.811387900355872e-07, "loss": 0.4008, "step": 1400 }, { "epoch": 2.8556962025316457, "grad_norm": 1.0727773827003273, "learning_rate": 2.455516014234876e-07, "loss": 0.4095, "step": 1410 }, { "epoch": 2.875949367088608, "grad_norm": 1.094488859203195, "learning_rate": 2.0996441281138792e-07, "loss": 0.4044, "step": 1420 }, { "epoch": 2.8962025316455695, "grad_norm": 1.1022638043714281, "learning_rate": 1.743772241992883e-07, "loss": 0.4053, "step": 1430 }, { "epoch": 2.9164556962025316, "grad_norm": 1.056288437679614, "learning_rate": 1.3879003558718863e-07, "loss": 0.407, "step": 1440 }, { "epoch": 2.9367088607594938, "grad_norm": 1.1311358899472859, "learning_rate": 1.0320284697508897e-07, "loss": 0.4152, "step": 1450 }, { "epoch": 2.9569620253164555, "grad_norm": 1.0577864076415648, "learning_rate": 6.761565836298933e-08, "loss": 0.4032, "step": 1460 }, { "epoch": 2.9772151898734176, "grad_norm": 1.0913097767086777, "learning_rate": 3.202846975088968e-08, "loss": 0.4067, "step": 1470 }, { "epoch": 2.9954430379746837, "eval_loss": 0.0700838565826416, "eval_runtime": 511.013, "eval_samples_per_second": 26.035, "eval_steps_per_second": 0.407, "step": 1479 }, { "epoch": 2.9954430379746837, "step": 1479, "total_flos": 2477170706350080.0, "train_loss": 0.4888964453768779, "train_runtime": 84815.3932, "train_samples_per_second": 8.94, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 1479, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2477170706350080.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }