{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.861614129219763, "eval_steps": 50, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.035770176615247035, "grad_norm": 0.2605220377445221, "learning_rate": 1.1904761904761905e-05, "loss": 1.9177, "step": 10 }, { "epoch": 0.07154035323049407, "grad_norm": 0.29095056653022766, "learning_rate": 2.380952380952381e-05, "loss": 1.8015, "step": 20 }, { "epoch": 0.10731052984574112, "grad_norm": 0.2734232544898987, "learning_rate": 3.571428571428572e-05, "loss": 1.5329, "step": 30 }, { "epoch": 0.14308070646098814, "grad_norm": 0.3937128186225891, "learning_rate": 4.761904761904762e-05, "loss": 1.0377, "step": 40 }, { "epoch": 0.1788508830762352, "grad_norm": 0.08446656167507172, "learning_rate": 5.9523809523809524e-05, "loss": 0.3894, "step": 50 }, { "epoch": 0.1788508830762352, "eval_loss": 0.2632354497909546, "eval_runtime": 681.8071, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 50 }, { "epoch": 0.21462105969148224, "grad_norm": 0.03653671219944954, "learning_rate": 7.142857142857143e-05, "loss": 0.2474, "step": 60 }, { "epoch": 0.25039123630672927, "grad_norm": 0.04479149729013443, "learning_rate": 8.333333333333334e-05, "loss": 0.2377, "step": 70 }, { "epoch": 0.2861614129219763, "grad_norm": 0.025527365505695343, "learning_rate": 9.523809523809524e-05, "loss": 0.2238, "step": 80 }, { "epoch": 0.32193158953722334, "grad_norm": 0.02618996612727642, "learning_rate": 9.920318725099602e-05, "loss": 0.2207, "step": 90 }, { "epoch": 0.3577017661524704, "grad_norm": 0.02924993820488453, "learning_rate": 9.787516600265605e-05, "loss": 0.214, "step": 100 }, { "epoch": 0.3577017661524704, "eval_loss": 0.211310014128685, "eval_runtime": 681.6444, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 100 }, { "epoch": 0.3934719427677174, "grad_norm": 0.024481266736984253, "learning_rate": 9.654714475431607e-05, "loss": 0.2125, "step": 110 }, { "epoch": 0.42924211938296447, "grad_norm": 0.023656228557229042, "learning_rate": 9.52191235059761e-05, "loss": 0.2034, "step": 120 }, { "epoch": 0.4650122959982115, "grad_norm": 0.023205537348985672, "learning_rate": 9.389110225763613e-05, "loss": 0.2023, "step": 130 }, { "epoch": 0.5007824726134585, "grad_norm": 0.025412462651729584, "learning_rate": 9.256308100929614e-05, "loss": 0.2017, "step": 140 }, { "epoch": 0.5365526492287056, "grad_norm": 0.025770917534828186, "learning_rate": 9.123505976095618e-05, "loss": 0.1987, "step": 150 }, { "epoch": 0.5365526492287056, "eval_loss": 0.19897614419460297, "eval_runtime": 681.7987, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 150 }, { "epoch": 0.5723228258439526, "grad_norm": 0.05282260850071907, "learning_rate": 8.990703851261621e-05, "loss": 0.1981, "step": 160 }, { "epoch": 0.6080930024591996, "grad_norm": 0.03017415478825569, "learning_rate": 8.857901726427624e-05, "loss": 0.1961, "step": 170 }, { "epoch": 0.6438631790744467, "grad_norm": 0.032814837992191315, "learning_rate": 8.725099601593627e-05, "loss": 0.1972, "step": 180 }, { "epoch": 0.6796333556896937, "grad_norm": 0.03899679705500603, "learning_rate": 8.592297476759629e-05, "loss": 0.1915, "step": 190 }, { "epoch": 0.7154035323049408, "grad_norm": 0.037111785262823105, "learning_rate": 8.459495351925632e-05, "loss": 0.1917, "step": 200 }, { "epoch": 0.7154035323049408, "eval_loss": 0.18888385593891144, "eval_runtime": 
681.8781, "eval_samples_per_second": 2.915, "eval_steps_per_second": 0.729, "step": 200 }, { "epoch": 0.7511737089201878, "grad_norm": 0.03370217606425285, "learning_rate": 8.326693227091635e-05, "loss": 0.1883, "step": 210 }, { "epoch": 0.7869438855354348, "grad_norm": 0.038315266370773315, "learning_rate": 8.193891102257637e-05, "loss": 0.1871, "step": 220 }, { "epoch": 0.8227140621506819, "grad_norm": 0.039963819086551666, "learning_rate": 8.061088977423639e-05, "loss": 0.1833, "step": 230 }, { "epoch": 0.8584842387659289, "grad_norm": 0.03971581161022186, "learning_rate": 7.928286852589642e-05, "loss": 0.1813, "step": 240 }, { "epoch": 0.8942544153811759, "grad_norm": 0.0481211394071579, "learning_rate": 7.795484727755644e-05, "loss": 0.1791, "step": 250 }, { "epoch": 0.8942544153811759, "eval_loss": 0.17786787450313568, "eval_runtime": 681.7787, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 250 }, { "epoch": 0.930024591996423, "grad_norm": 0.04264125972986221, "learning_rate": 7.662682602921647e-05, "loss": 0.1737, "step": 260 }, { "epoch": 0.96579476861167, "grad_norm": 0.04976845905184746, "learning_rate": 7.52988047808765e-05, "loss": 0.1749, "step": 270 }, { "epoch": 1.001564945226917, "grad_norm": 0.05254867300391197, "learning_rate": 7.397078353253653e-05, "loss": 0.1719, "step": 280 }, { "epoch": 1.037335121842164, "grad_norm": 0.055027466267347336, "learning_rate": 7.264276228419655e-05, "loss": 0.1651, "step": 290 }, { "epoch": 1.0731052984574112, "grad_norm": 0.07110296189785004, "learning_rate": 7.131474103585658e-05, "loss": 0.1612, "step": 300 }, { "epoch": 1.0731052984574112, "eval_loss": 0.16210857033729553, "eval_runtime": 680.436, "eval_samples_per_second": 2.922, "eval_steps_per_second": 0.73, "step": 300 }, { "epoch": 1.1088754750726582, "grad_norm": 0.06281454861164093, "learning_rate": 6.998671978751661e-05, "loss": 0.1575, "step": 310 }, { "epoch": 1.144645651687905, "grad_norm": 0.07971349358558655, "learning_rate": 6.865869853917662e-05, "loss": 0.1534, "step": 320 }, { "epoch": 1.1804158283031523, "grad_norm": 0.08370919525623322, "learning_rate": 6.733067729083665e-05, "loss": 0.1487, "step": 330 }, { "epoch": 1.2161860049183992, "grad_norm": 0.08752310276031494, "learning_rate": 6.600265604249668e-05, "loss": 0.1461, "step": 340 }, { "epoch": 1.2519561815336462, "grad_norm": 0.09238439798355103, "learning_rate": 6.46746347941567e-05, "loss": 0.1394, "step": 350 }, { "epoch": 1.2519561815336462, "eval_loss": 0.13786284625530243, "eval_runtime": 680.5234, "eval_samples_per_second": 2.921, "eval_steps_per_second": 0.73, "step": 350 }, { "epoch": 1.2877263581488934, "grad_norm": 0.10012154281139374, "learning_rate": 6.334661354581673e-05, "loss": 0.1359, "step": 360 }, { "epoch": 1.3234965347641405, "grad_norm": 0.11639019101858139, "learning_rate": 6.201859229747676e-05, "loss": 0.1309, "step": 370 }, { "epoch": 1.3592667113793875, "grad_norm": 0.10917984694242477, "learning_rate": 6.069057104913679e-05, "loss": 0.123, "step": 380 }, { "epoch": 1.3950368879946344, "grad_norm": 0.1265992373228073, "learning_rate": 5.936254980079682e-05, "loss": 0.1161, "step": 390 }, { "epoch": 1.4308070646098816, "grad_norm": 0.12816202640533447, "learning_rate": 5.803452855245685e-05, "loss": 0.1115, "step": 400 }, { "epoch": 1.4308070646098816, "eval_loss": 0.10932095348834991, "eval_runtime": 680.5519, "eval_samples_per_second": 2.921, "eval_steps_per_second": 0.73, "step": 400 }, { "epoch": 1.4665772412251286, "grad_norm": 0.13660216331481934, 
"learning_rate": 5.6706507304116875e-05, "loss": 0.105, "step": 410 }, { "epoch": 1.5023474178403755, "grad_norm": 0.14816845953464508, "learning_rate": 5.537848605577689e-05, "loss": 0.1016, "step": 420 }, { "epoch": 1.5381175944556227, "grad_norm": 0.16880381107330322, "learning_rate": 5.405046480743692e-05, "loss": 0.0949, "step": 430 }, { "epoch": 1.5738877710708696, "grad_norm": 0.15608805418014526, "learning_rate": 5.2722443559096944e-05, "loss": 0.09, "step": 440 }, { "epoch": 1.6096579476861166, "grad_norm": 0.15114428102970123, "learning_rate": 5.139442231075697e-05, "loss": 0.0852, "step": 450 }, { "epoch": 1.6096579476861166, "eval_loss": 0.08125962316989899, "eval_runtime": 680.5355, "eval_samples_per_second": 2.921, "eval_steps_per_second": 0.73, "step": 450 }, { "epoch": 1.6454281243013638, "grad_norm": 0.1671249121427536, "learning_rate": 5.0066401062417e-05, "loss": 0.0793, "step": 460 }, { "epoch": 1.681198300916611, "grad_norm": 0.16212376952171326, "learning_rate": 4.8738379814077026e-05, "loss": 0.0745, "step": 470 }, { "epoch": 1.7169684775318577, "grad_norm": 0.1631159633398056, "learning_rate": 4.7410358565737054e-05, "loss": 0.0699, "step": 480 }, { "epoch": 1.7527386541471048, "grad_norm": 0.17926354706287384, "learning_rate": 4.608233731739708e-05, "loss": 0.0621, "step": 490 }, { "epoch": 1.788508830762352, "grad_norm": 0.1659461259841919, "learning_rate": 4.475431606905711e-05, "loss": 0.06, "step": 500 }, { "epoch": 1.788508830762352, "eval_loss": 0.058878425508737564, "eval_runtime": 680.5096, "eval_samples_per_second": 2.921, "eval_steps_per_second": 0.73, "step": 500 }, { "epoch": 1.824279007377599, "grad_norm": 0.1780182421207428, "learning_rate": 4.3426294820717136e-05, "loss": 0.0564, "step": 510 }, { "epoch": 1.860049183992846, "grad_norm": 0.17848838865756989, "learning_rate": 4.2098273572377163e-05, "loss": 0.0522, "step": 520 }, { "epoch": 1.895819360608093, "grad_norm": 0.17067597806453705, "learning_rate": 4.077025232403719e-05, "loss": 0.0505, "step": 530 }, { "epoch": 1.93158953722334, "grad_norm": 0.16824504733085632, "learning_rate": 3.944223107569721e-05, "loss": 0.0464, "step": 540 }, { "epoch": 1.967359713838587, "grad_norm": 0.17104943096637726, "learning_rate": 3.811420982735724e-05, "loss": 0.043, "step": 550 }, { "epoch": 1.967359713838587, "eval_loss": 0.04172534495592117, "eval_runtime": 680.4722, "eval_samples_per_second": 2.922, "eval_steps_per_second": 0.73, "step": 550 }, { "epoch": 2.003129890453834, "grad_norm": 0.1792486160993576, "learning_rate": 3.6786188579017266e-05, "loss": 0.0399, "step": 560 }, { "epoch": 2.0389000670690813, "grad_norm": 0.13952085375785828, "learning_rate": 3.5458167330677294e-05, "loss": 0.0327, "step": 570 }, { "epoch": 2.074670243684328, "grad_norm": 0.1532105654478073, "learning_rate": 3.4130146082337314e-05, "loss": 0.0308, "step": 580 }, { "epoch": 2.1104404202995752, "grad_norm": 0.16048988699913025, "learning_rate": 3.280212483399734e-05, "loss": 0.0295, "step": 590 }, { "epoch": 2.1462105969148224, "grad_norm": 0.13789258897304535, "learning_rate": 3.147410358565737e-05, "loss": 0.0294, "step": 600 }, { "epoch": 2.1462105969148224, "eval_loss": 0.030796948820352554, "eval_runtime": 681.0206, "eval_samples_per_second": 2.919, "eval_steps_per_second": 0.73, "step": 600 }, { "epoch": 2.181980773530069, "grad_norm": 0.1437063366174698, "learning_rate": 3.01460823373174e-05, "loss": 0.0273, "step": 610 }, { "epoch": 2.2177509501453163, "grad_norm": 0.14592672884464264, "learning_rate": 
2.8818061088977427e-05, "loss": 0.0255, "step": 620 }, { "epoch": 2.2535211267605635, "grad_norm": 0.14989186823368073, "learning_rate": 2.7490039840637448e-05, "loss": 0.0238, "step": 630 }, { "epoch": 2.28929130337581, "grad_norm": 0.13284747302532196, "learning_rate": 2.6162018592297476e-05, "loss": 0.0227, "step": 640 }, { "epoch": 2.3250614799910574, "grad_norm": 0.13112632930278778, "learning_rate": 2.4833997343957506e-05, "loss": 0.0222, "step": 650 }, { "epoch": 2.3250614799910574, "eval_loss": 0.023032352328300476, "eval_runtime": 681.8193, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 650 }, { "epoch": 2.3608316566063046, "grad_norm": 0.12297944724559784, "learning_rate": 2.3505976095617534e-05, "loss": 0.0206, "step": 660 }, { "epoch": 2.3966018332215517, "grad_norm": 0.13545510172843933, "learning_rate": 2.2177954847277558e-05, "loss": 0.0197, "step": 670 }, { "epoch": 2.4323720098367985, "grad_norm": 0.11164336651563644, "learning_rate": 2.0849933598937585e-05, "loss": 0.0192, "step": 680 }, { "epoch": 2.4681421864520456, "grad_norm": 0.11971808969974518, "learning_rate": 1.952191235059761e-05, "loss": 0.0187, "step": 690 }, { "epoch": 2.5039123630672924, "grad_norm": 0.08939900994300842, "learning_rate": 1.8193891102257637e-05, "loss": 0.0177, "step": 700 }, { "epoch": 2.5039123630672924, "eval_loss": 0.018428906798362732, "eval_runtime": 681.7832, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 700 }, { "epoch": 2.5396825396825395, "grad_norm": 0.09480512142181396, "learning_rate": 1.6865869853917664e-05, "loss": 0.017, "step": 710 }, { "epoch": 2.5754527162977867, "grad_norm": 0.11313159018754959, "learning_rate": 1.553784860557769e-05, "loss": 0.0164, "step": 720 }, { "epoch": 2.611222892913034, "grad_norm": 0.10267580300569534, "learning_rate": 1.4209827357237717e-05, "loss": 0.0161, "step": 730 }, { "epoch": 2.646993069528281, "grad_norm": 0.11123546212911606, "learning_rate": 1.2881806108897743e-05, "loss": 0.0154, "step": 740 }, { "epoch": 2.682763246143528, "grad_norm": 0.09317459166049957, "learning_rate": 1.1553784860557769e-05, "loss": 0.0152, "step": 750 }, { "epoch": 2.682763246143528, "eval_loss": 0.015589827671647072, "eval_runtime": 681.7975, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 750 }, { "epoch": 2.718533422758775, "grad_norm": 0.0923512727022171, "learning_rate": 1.0225763612217796e-05, "loss": 0.015, "step": 760 }, { "epoch": 2.7543035993740217, "grad_norm": 0.09502092003822327, "learning_rate": 8.897742363877823e-06, "loss": 0.0143, "step": 770 }, { "epoch": 2.790073775989269, "grad_norm": 0.09042944759130478, "learning_rate": 7.569721115537849e-06, "loss": 0.0143, "step": 780 }, { "epoch": 2.825843952604516, "grad_norm": 0.08655345439910889, "learning_rate": 6.241699867197876e-06, "loss": 0.0139, "step": 790 }, { "epoch": 2.861614129219763, "grad_norm": 0.07545724511146545, "learning_rate": 4.9136786188579014e-06, "loss": 0.0137, "step": 800 }, { "epoch": 2.861614129219763, "eval_loss": 0.014045392163097858, "eval_runtime": 681.381, "eval_samples_per_second": 2.918, "eval_steps_per_second": 0.729, "step": 800 } ], "logging_steps": 10, "max_steps": 837, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.194702349626769e+19, 
"train_batch_size": 1, "trial_name": null, "trial_params": null }