{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 0.5752072334289551, "learning_rate": 0.0001993517017828201, "loss": 1.6434, "step": 10 }, { "epoch": 0.03, "grad_norm": 0.7842408418655396, "learning_rate": 0.0001961102106969206, "loss": 1.2847, "step": 20 }, { "epoch": 0.05, "grad_norm": 0.4593060612678528, "learning_rate": 0.0001928687196110211, "loss": 1.1227, "step": 30 }, { "epoch": 0.06, "grad_norm": 0.45610150694847107, "learning_rate": 0.00018962722852512156, "loss": 1.0359, "step": 40 }, { "epoch": 0.08, "grad_norm": 0.4743576645851135, "learning_rate": 0.00018638573743922206, "loss": 0.992, "step": 50 }, { "epoch": 0.1, "grad_norm": 0.5059588551521301, "learning_rate": 0.00018314424635332255, "loss": 0.9782, "step": 60 }, { "epoch": 0.11, "grad_norm": 0.5296839475631714, "learning_rate": 0.00017990275526742302, "loss": 0.8997, "step": 70 }, { "epoch": 0.13, "grad_norm": 0.6169719099998474, "learning_rate": 0.00017666126418152352, "loss": 0.887, "step": 80 }, { "epoch": 0.14, "grad_norm": 0.5174543261528015, "learning_rate": 0.00017341977309562402, "loss": 0.8388, "step": 90 }, { "epoch": 0.16, "grad_norm": 0.5253883600234985, "learning_rate": 0.0001701782820097245, "loss": 0.875, "step": 100 }, { "epoch": 0.18, "grad_norm": 0.5177808403968811, "learning_rate": 0.00016693679092382496, "loss": 0.8193, "step": 110 }, { "epoch": 0.19, "grad_norm": 0.50547856092453, "learning_rate": 0.00016369529983792545, "loss": 0.8444, "step": 120 }, { "epoch": 0.21, "grad_norm": 0.5066828727722168, "learning_rate": 0.00016045380875202592, "loss": 0.857, "step": 130 }, { "epoch": 0.22, "grad_norm": 0.4495781362056732, "learning_rate": 0.00015721231766612642, "loss": 0.7897, "step": 140 }, { "epoch": 0.24, "grad_norm": 0.5362279415130615, "learning_rate": 0.00015429497568881685, "loss": 0.7992, "step": 150 }, { "epoch": 0.26, "grad_norm": 0.5713534951210022, "learning_rate": 0.00015105348460291734, "loss": 0.8446, "step": 160 }, { "epoch": 0.27, "grad_norm": 0.4843427240848541, "learning_rate": 0.00014781199351701784, "loss": 0.7906, "step": 170 }, { "epoch": 0.29, "grad_norm": 0.4727051556110382, "learning_rate": 0.0001445705024311183, "loss": 0.8281, "step": 180 }, { "epoch": 0.3, "grad_norm": 0.4981895685195923, "learning_rate": 0.0001413290113452188, "loss": 0.7966, "step": 190 }, { "epoch": 0.32, "grad_norm": 0.4925459027290344, "learning_rate": 0.0001380875202593193, "loss": 0.7889, "step": 200 }, { "epoch": 0.34, "grad_norm": 0.4470859467983246, "learning_rate": 0.00013484602917341977, "loss": 0.7588, "step": 210 }, { "epoch": 0.35, "grad_norm": 0.46052438020706177, "learning_rate": 0.00013160453808752027, "loss": 0.7863, "step": 220 }, { "epoch": 0.37, "grad_norm": 0.4708721935749054, "learning_rate": 0.00012836304700162077, "loss": 0.8244, "step": 230 }, { "epoch": 0.38, "grad_norm": 0.4670282304286957, "learning_rate": 0.00012512155591572124, "loss": 0.7391, "step": 240 }, { "epoch": 0.4, "grad_norm": 0.4596056342124939, "learning_rate": 0.00012188006482982173, "loss": 0.8057, "step": 250 }, { "epoch": 0.42, "grad_norm": 0.3906649053096771, "learning_rate": 0.0001186385737439222, "loss": 0.7594, "step": 260 }, { "epoch": 0.43, "grad_norm": 0.5942690372467041, "learning_rate": 0.0001153970826580227, "loss": 0.7615, "step": 270 }, { "epoch": 0.45, "grad_norm": 0.40805208683013916, "learning_rate": 0.00011215559157212318, "loss": 0.7607, "step": 280 }, { "epoch": 0.46, "grad_norm": 0.5320433974266052, "learning_rate": 0.00010891410048622365, "loss": 0.7666, "step": 290 }, { "epoch": 0.48, "grad_norm": 0.4362870752811432, "learning_rate": 0.00010567260940032415, "loss": 0.7683, "step": 300 }, { "epoch": 0.5, "grad_norm": 0.5106446146965027, "learning_rate": 0.00010243111831442465, "loss": 0.7777, "step": 310 }, { "epoch": 0.51, "grad_norm": 0.506986677646637, "learning_rate": 9.918962722852513e-05, "loss": 0.7637, "step": 320 }, { "epoch": 0.53, "grad_norm": 0.5665115118026733, "learning_rate": 9.594813614262561e-05, "loss": 0.7497, "step": 330 }, { "epoch": 0.54, "grad_norm": 0.4510403275489807, "learning_rate": 9.27066450567261e-05, "loss": 0.7644, "step": 340 }, { "epoch": 0.56, "grad_norm": 0.5009343028068542, "learning_rate": 8.946515397082659e-05, "loss": 0.7766, "step": 350 }, { "epoch": 0.58, "grad_norm": 0.4533093273639679, "learning_rate": 8.622366288492708e-05, "loss": 0.7579, "step": 360 }, { "epoch": 0.59, "grad_norm": 0.4576455056667328, "learning_rate": 8.298217179902756e-05, "loss": 0.7731, "step": 370 }, { "epoch": 0.61, "grad_norm": 0.47114574909210205, "learning_rate": 7.974068071312804e-05, "loss": 0.7612, "step": 380 }, { "epoch": 0.62, "grad_norm": 0.5052211284637451, "learning_rate": 7.649918962722853e-05, "loss": 0.7529, "step": 390 }, { "epoch": 0.64, "grad_norm": 0.4952121675014496, "learning_rate": 7.325769854132901e-05, "loss": 0.7645, "step": 400 }, { "epoch": 0.66, "grad_norm": 0.5584261417388916, "learning_rate": 7.00162074554295e-05, "loss": 0.7859, "step": 410 }, { "epoch": 0.67, "grad_norm": 0.5271150469779968, "learning_rate": 6.677471636952999e-05, "loss": 0.7662, "step": 420 }, { "epoch": 0.69, "grad_norm": 0.6051105260848999, "learning_rate": 6.353322528363047e-05, "loss": 0.7217, "step": 430 }, { "epoch": 0.7, "grad_norm": 0.5145427584648132, "learning_rate": 6.029173419773096e-05, "loss": 0.763, "step": 440 }, { "epoch": 0.72, "grad_norm": 0.5100213885307312, "learning_rate": 5.7050243111831445e-05, "loss": 0.7493, "step": 450 }, { "epoch": 0.74, "grad_norm": 0.6406119465827942, "learning_rate": 5.380875202593193e-05, "loss": 0.7856, "step": 460 }, { "epoch": 0.75, "grad_norm": 0.5142606496810913, "learning_rate": 5.056726094003241e-05, "loss": 0.7439, "step": 470 }, { "epoch": 0.77, "grad_norm": 0.6167362332344055, "learning_rate": 4.73257698541329e-05, "loss": 0.7615, "step": 480 }, { "epoch": 0.78, "grad_norm": 0.5693132281303406, "learning_rate": 4.408427876823339e-05, "loss": 0.7535, "step": 490 }, { "epoch": 0.8, "grad_norm": 0.5026053190231323, "learning_rate": 4.0842787682333875e-05, "loss": 0.7397, "step": 500 }, { "epoch": 0.82, "grad_norm": 0.5282934904098511, "learning_rate": 3.760129659643436e-05, "loss": 0.7519, "step": 510 }, { "epoch": 0.83, "grad_norm": 0.5583475828170776, "learning_rate": 3.435980551053485e-05, "loss": 0.736, "step": 520 }, { "epoch": 0.85, "grad_norm": 0.5823280215263367, "learning_rate": 3.111831442463534e-05, "loss": 0.7383, "step": 530 }, { "epoch": 0.86, "grad_norm": 0.4686075747013092, "learning_rate": 2.7876823338735818e-05, "loss": 0.7272, "step": 540 }, { "epoch": 0.88, "grad_norm": 0.5750771164894104, "learning_rate": 2.4635332252836304e-05, "loss": 0.7483, "step": 550 }, { "epoch": 0.9, "grad_norm": 0.4452055096626282, "learning_rate": 2.1393841166936794e-05, "loss": 0.7412, "step": 560 }, { "epoch": 0.91, "grad_norm": 0.48527467250823975, "learning_rate": 1.8152350081037278e-05, "loss": 0.7287, "step": 570 }, { "epoch": 0.93, "grad_norm": 0.6023633480072021, "learning_rate": 1.4910858995137764e-05, "loss": 0.7856, "step": 580 }, { "epoch": 0.94, "grad_norm": 0.40449661016464233, "learning_rate": 1.1669367909238251e-05, "loss": 0.7191, "step": 590 }, { "epoch": 0.96, "grad_norm": 0.5225895643234253, "learning_rate": 8.427876823338736e-06, "loss": 0.7132, "step": 600 }, { "epoch": 0.98, "grad_norm": 0.4504361152648926, "learning_rate": 5.186385737439222e-06, "loss": 0.7344, "step": 610 }, { "epoch": 0.99, "grad_norm": 0.48884958028793335, "learning_rate": 1.9448946515397086e-06, "loss": 0.7086, "step": 620 }, { "epoch": 1.0, "eval_loss": 0.7055141925811768, "eval_runtime": 58.7356, "eval_samples_per_second": 1.703, "eval_steps_per_second": 0.426, "step": 625 } ], "logging_steps": 10, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 4350923021058048.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }