{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995988768551946, "eval_steps": 100, "global_step": 623, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016044925792218213, "grad_norm": 0.07964161388964953, "learning_rate": 1.5873015873015873e-06, "loss": 1.478, "step": 10 }, { "epoch": 0.032089851584436425, "grad_norm": 0.08446800204864474, "learning_rate": 3.1746031746031746e-06, "loss": 1.4734, "step": 20 }, { "epoch": 0.048134777376654635, "grad_norm": 0.0880665470615365, "learning_rate": 4.761904761904762e-06, "loss": 1.4686, "step": 30 }, { "epoch": 0.06417970316887285, "grad_norm": 0.11036773606126081, "learning_rate": 6.349206349206349e-06, "loss": 1.4799, "step": 40 }, { "epoch": 0.08022462896109106, "grad_norm": 0.1984820440725813, "learning_rate": 7.936507936507936e-06, "loss": 1.4712, "step": 50 }, { "epoch": 0.09626955475330927, "grad_norm": 0.3636876671405763, "learning_rate": 9.523809523809525e-06, "loss": 1.4479, "step": 60 }, { "epoch": 0.11231448054552748, "grad_norm": 0.33961243491902277, "learning_rate": 9.996145181203616e-06, "loss": 1.3449, "step": 70 }, { "epoch": 0.1283594063377457, "grad_norm": 0.11623936014320749, "learning_rate": 9.977278743495434e-06, "loss": 1.3048, "step": 80 }, { "epoch": 0.1444043321299639, "grad_norm": 0.09934569363354419, "learning_rate": 9.942751945444437e-06, "loss": 1.2392, "step": 90 }, { "epoch": 0.16044925792218212, "grad_norm": 0.0749779795039777, "learning_rate": 9.892673421130979e-06, "loss": 1.2155, "step": 100 }, { "epoch": 0.16044925792218212, "eval_loss": 1.2045714855194092, "eval_runtime": 36.9278, "eval_samples_per_second": 5.47, "eval_steps_per_second": 1.381, "step": 100 }, { "epoch": 0.17649418371440032, "grad_norm": 0.06497134561472019, "learning_rate": 9.827200736119815e-06, "loss": 1.2129, "step": 110 }, { "epoch": 0.19253910950661854, "grad_norm": 0.06149214025937759, "learning_rate": 9.746539891700558e-06, "loss": 1.1695, "step": 120 }, { "epoch": 0.20858403529883673, "grad_norm": 0.05334876254417194, "learning_rate": 9.650944676731383e-06, "loss": 1.162, "step": 130 }, { "epoch": 0.22462896109105496, "grad_norm": 0.04556668435804944, "learning_rate": 9.540715869125407e-06, "loss": 1.1491, "step": 140 }, { "epoch": 0.24067388688327315, "grad_norm": 0.04356126220951498, "learning_rate": 9.416200289492092e-06, "loss": 1.1358, "step": 150 }, { "epoch": 0.2567188126754914, "grad_norm": 0.03723127058817997, "learning_rate": 9.27778970991129e-06, "loss": 1.1338, "step": 160 }, { "epoch": 0.27276373846770957, "grad_norm": 0.03598344746987169, "learning_rate": 9.125919621273348e-06, "loss": 1.1333, "step": 170 }, { "epoch": 0.2888086642599278, "grad_norm": 0.03958997125425865, "learning_rate": 8.961067863063638e-06, "loss": 1.1382, "step": 180 }, { "epoch": 0.304853590052146, "grad_norm": 0.033584445768925664, "learning_rate": 8.783753119902766e-06, "loss": 1.1329, "step": 190 }, { "epoch": 0.32089851584436424, "grad_norm": 0.0370694089463755, "learning_rate": 8.594533289572852e-06, "loss": 1.1392, "step": 200 }, { "epoch": 0.32089851584436424, "eval_loss": 1.1237837076187134, "eval_runtime": 36.9468, "eval_samples_per_second": 5.467, "eval_steps_per_second": 1.38, "step": 200 }, { "epoch": 0.3369434416365824, "grad_norm": 0.031404112904854176, "learning_rate": 8.39400372766471e-06, "loss": 1.1338, "step": 210 }, { "epoch": 0.35298836742880063, "grad_norm": 0.03068751780314738, "learning_rate": 8.182795374368893e-06, "loss": 1.1313, "step": 220 }, { "epoch": 0.36903329322101885, "grad_norm": 0.030467037926615172, "learning_rate": 7.961572769304437e-06, "loss": 1.1247, "step": 230 }, { "epoch": 0.3850782190132371, "grad_norm": 0.030773507709064975, "learning_rate": 7.731031960631354e-06, "loss": 1.1318, "step": 240 }, { "epoch": 0.4011231448054553, "grad_norm": 0.033328095202181884, "learning_rate": 7.491898315025615e-06, "loss": 1.1249, "step": 250 }, { "epoch": 0.41716807059767347, "grad_norm": 0.03128616232047187, "learning_rate": 7.244924235407224e-06, "loss": 1.1162, "step": 260 }, { "epoch": 0.4332129963898917, "grad_norm": 0.031188168038119562, "learning_rate": 6.990886793602268e-06, "loss": 1.1133, "step": 270 }, { "epoch": 0.4492579221821099, "grad_norm": 0.03333159876932075, "learning_rate": 6.730585285387465e-06, "loss": 1.1056, "step": 280 }, { "epoch": 0.46530284797432814, "grad_norm": 0.0316593444984735, "learning_rate": 6.464838715609945e-06, "loss": 1.1266, "step": 290 }, { "epoch": 0.4813477737665463, "grad_norm": 0.03167501381379994, "learning_rate": 6.194483221294989e-06, "loss": 1.1181, "step": 300 }, { "epoch": 0.4813477737665463, "eval_loss": 1.1140353679656982, "eval_runtime": 36.7217, "eval_samples_per_second": 5.501, "eval_steps_per_second": 1.389, "step": 300 }, { "epoch": 0.49739269955876453, "grad_norm": 0.03328370476401871, "learning_rate": 5.920369440849609e-06, "loss": 1.0962, "step": 310 }, { "epoch": 0.5134376253509828, "grad_norm": 0.03447585267125308, "learning_rate": 5.643359837639419e-06, "loss": 1.1171, "step": 320 }, { "epoch": 0.529482551143201, "grad_norm": 0.03781496580664963, "learning_rate": 5.3643259863598015e-06, "loss": 1.1091, "step": 330 }, { "epoch": 0.5455274769354191, "grad_norm": 0.03172320948571133, "learning_rate": 5.084145830739462e-06, "loss": 1.1017, "step": 340 }, { "epoch": 0.5615724027276374, "grad_norm": 0.03412876166847143, "learning_rate": 4.803700921204659e-06, "loss": 1.1169, "step": 350 }, { "epoch": 0.5776173285198556, "grad_norm": 0.03375281781263732, "learning_rate": 4.5238736411954075e-06, "loss": 1.0998, "step": 360 }, { "epoch": 0.5936622543120738, "grad_norm": 0.03717435838572482, "learning_rate": 4.245544430860743e-06, "loss": 1.1117, "step": 370 }, { "epoch": 0.609707180104292, "grad_norm": 0.029559551388140908, "learning_rate": 3.969589016868269e-06, "loss": 1.1283, "step": 380 }, { "epoch": 0.6257521058965102, "grad_norm": 0.04200189123455742, "learning_rate": 3.6968756570440735e-06, "loss": 1.1228, "step": 390 }, { "epoch": 0.6417970316887285, "grad_norm": 0.03378722314455019, "learning_rate": 3.42826240851239e-06, "loss": 1.1252, "step": 400 }, { "epoch": 0.6417970316887285, "eval_loss": 1.1096988916397095, "eval_runtime": 36.792, "eval_samples_per_second": 5.49, "eval_steps_per_second": 1.386, "step": 400 }, { "epoch": 0.6578419574809466, "grad_norm": 0.03734682816128997, "learning_rate": 3.1645944279304296e-06, "loss": 1.1085, "step": 410 }, { "epoch": 0.6738868832731648, "grad_norm": 0.06074049582614517, "learning_rate": 2.906701312312861e-06, "loss": 1.0973, "step": 420 }, { "epoch": 0.6899318090653831, "grad_norm": 0.03310219959096242, "learning_rate": 2.6553944888126772e-06, "loss": 1.0965, "step": 430 }, { "epoch": 0.7059767348576013, "grad_norm": 0.036021492275449166, "learning_rate": 2.4114646616711844e-06, "loss": 1.1028, "step": 440 }, { "epoch": 0.7220216606498195, "grad_norm": 0.03370764681001696, "learning_rate": 2.175679324369913e-06, "loss": 1.1094, "step": 450 }, { "epoch": 0.7380665864420377, "grad_norm": 0.032620569117429374, "learning_rate": 1.948780344812181e-06, "loss": 1.103, "step": 460 }, { "epoch": 0.7541115122342559, "grad_norm": 0.03518623684780172, "learning_rate": 1.7314816311322219e-06, "loss": 1.1019, "step": 470 }, { "epoch": 0.7701564380264742, "grad_norm": 0.033154472751256124, "learning_rate": 1.5244668854760459e-06, "loss": 1.1025, "step": 480 }, { "epoch": 0.7862013638186923, "grad_norm": 0.03282196969531501, "learning_rate": 1.3283874528215735e-06, "loss": 1.0912, "step": 490 }, { "epoch": 0.8022462896109106, "grad_norm": 0.03261545464797978, "learning_rate": 1.143860271606333e-06, "loss": 1.1199, "step": 500 }, { "epoch": 0.8022462896109106, "eval_loss": 1.1079450845718384, "eval_runtime": 37.0687, "eval_samples_per_second": 5.449, "eval_steps_per_second": 1.376, "step": 500 }, { "epoch": 0.8182912154031288, "grad_norm": 0.03521866325178647, "learning_rate": 9.714659326109138e-07, "loss": 1.1117, "step": 510 }, { "epoch": 0.8343361411953469, "grad_norm": 0.03534230366225303, "learning_rate": 8.117468522055578e-07, "loss": 1.1197, "step": 520 }, { "epoch": 0.8503810669875652, "grad_norm": 0.03192663388371955, "learning_rate": 6.652055657075845e-07, "loss": 1.1212, "step": 530 }, { "epoch": 0.8664259927797834, "grad_norm": 0.03381094832008424, "learning_rate": 5.323031462193757e-07, "loss": 1.1147, "step": 540 }, { "epoch": 0.8824709185720016, "grad_norm": 0.0312520726762653, "learning_rate": 4.134577539217965e-07, "loss": 1.1094, "step": 550 }, { "epoch": 0.8985158443642198, "grad_norm": 0.031196657419516906, "learning_rate": 3.0904332038757977e-07, "loss": 1.1008, "step": 560 }, { "epoch": 0.914560770156438, "grad_norm": 0.03159621090668423, "learning_rate": 2.1938837205424002e-07, "loss": 1.1146, "step": 570 }, { "epoch": 0.9306056959486563, "grad_norm": 0.03478277084648844, "learning_rate": 1.4477499655837278e-07, "loss": 1.1122, "step": 580 }, { "epoch": 0.9466506217408744, "grad_norm": 0.03684852374687432, "learning_rate": 8.543795518357767e-08, "loss": 1.1185, "step": 590 }, { "epoch": 0.9626955475330926, "grad_norm": 0.03649848908454171, "learning_rate": 4.15639442146093e-08, "loss": 1.1104, "step": 600 }, { "epoch": 0.9626955475330926, "eval_loss": 1.107527494430542, "eval_runtime": 36.8485, "eval_samples_per_second": 5.482, "eval_steps_per_second": 1.384, "step": 600 }, { "epoch": 0.9787404733253109, "grad_norm": 0.032198270473962474, "learning_rate": 1.3291007521799015e-08, "loss": 1.1028, "step": 610 }, { "epoch": 0.9947853991175291, "grad_norm": 0.0365869343105355, "learning_rate": 7.081022239591173e-10, "loss": 1.1179, "step": 620 }, { "epoch": 0.9995988768551946, "step": 623, "total_flos": 6308174864842752.0, "train_loss": 1.1638640125146074, "train_runtime": 6246.6188, "train_samples_per_second": 3.192, "train_steps_per_second": 0.1 } ], "logging_steps": 10, "max_steps": 623, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6308174864842752.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }