{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.1462105969148224, "eval_steps": 50, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.035770176615247035, "grad_norm": 0.2605220377445221, "learning_rate": 1.1904761904761905e-05, "loss": 1.9177, "step": 10 }, { "epoch": 0.07154035323049407, "grad_norm": 0.29095056653022766, "learning_rate": 2.380952380952381e-05, "loss": 1.8015, "step": 20 }, { "epoch": 0.10731052984574112, "grad_norm": 0.2734232544898987, "learning_rate": 3.571428571428572e-05, "loss": 1.5329, "step": 30 }, { "epoch": 0.14308070646098814, "grad_norm": 0.3937128186225891, "learning_rate": 4.761904761904762e-05, "loss": 1.0377, "step": 40 }, { "epoch": 0.1788508830762352, "grad_norm": 0.08446656167507172, "learning_rate": 5.9523809523809524e-05, "loss": 0.3894, "step": 50 }, { "epoch": 0.1788508830762352, "eval_loss": 0.2632354497909546, "eval_runtime": 681.8071, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 50 }, { "epoch": 0.21462105969148224, "grad_norm": 0.03653671219944954, "learning_rate": 7.142857142857143e-05, "loss": 0.2474, "step": 60 }, { "epoch": 0.25039123630672927, "grad_norm": 0.04479149729013443, "learning_rate": 8.333333333333334e-05, "loss": 0.2377, "step": 70 }, { "epoch": 0.2861614129219763, "grad_norm": 0.025527365505695343, "learning_rate": 9.523809523809524e-05, "loss": 0.2238, "step": 80 }, { "epoch": 0.32193158953722334, "grad_norm": 0.02618996612727642, "learning_rate": 9.920318725099602e-05, "loss": 0.2207, "step": 90 }, { "epoch": 0.3577017661524704, "grad_norm": 0.02924993820488453, "learning_rate": 9.787516600265605e-05, "loss": 0.214, "step": 100 }, { "epoch": 0.3577017661524704, "eval_loss": 0.211310014128685, "eval_runtime": 681.6444, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 100 }, { "epoch": 0.3934719427677174, "grad_norm": 0.024481266736984253, "learning_rate": 9.654714475431607e-05, "loss": 0.2125, "step": 110 }, { "epoch": 0.42924211938296447, "grad_norm": 0.023656228557229042, "learning_rate": 9.52191235059761e-05, "loss": 0.2034, "step": 120 }, { "epoch": 0.4650122959982115, "grad_norm": 0.023205537348985672, "learning_rate": 9.389110225763613e-05, "loss": 0.2023, "step": 130 }, { "epoch": 0.5007824726134585, "grad_norm": 0.025412462651729584, "learning_rate": 9.256308100929614e-05, "loss": 0.2017, "step": 140 }, { "epoch": 0.5365526492287056, "grad_norm": 0.025770917534828186, "learning_rate": 9.123505976095618e-05, "loss": 0.1987, "step": 150 }, { "epoch": 0.5365526492287056, "eval_loss": 0.19897614419460297, "eval_runtime": 681.7987, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 150 }, { "epoch": 0.5723228258439526, "grad_norm": 0.05282260850071907, "learning_rate": 8.990703851261621e-05, "loss": 0.1981, "step": 160 }, { "epoch": 0.6080930024591996, "grad_norm": 0.03017415478825569, "learning_rate": 8.857901726427624e-05, "loss": 0.1961, "step": 170 }, { "epoch": 0.6438631790744467, "grad_norm": 0.032814837992191315, "learning_rate": 8.725099601593627e-05, "loss": 0.1972, "step": 180 }, { "epoch": 0.6796333556896937, "grad_norm": 0.03899679705500603, "learning_rate": 8.592297476759629e-05, "loss": 0.1915, "step": 190 }, { "epoch": 0.7154035323049408, "grad_norm": 0.037111785262823105, "learning_rate": 8.459495351925632e-05, "loss": 0.1917, "step": 200 }, { "epoch": 0.7154035323049408, "eval_loss": 0.18888385593891144, "eval_runtime": 681.8781, "eval_samples_per_second": 2.915, "eval_steps_per_second": 0.729, "step": 200 }, { "epoch": 0.7511737089201878, "grad_norm": 0.03370217606425285, "learning_rate": 8.326693227091635e-05, "loss": 0.1883, "step": 210 }, { "epoch": 0.7869438855354348, "grad_norm": 0.038315266370773315, "learning_rate": 8.193891102257637e-05, "loss": 0.1871, "step": 220 }, { "epoch": 0.8227140621506819, "grad_norm": 0.039963819086551666, "learning_rate": 8.061088977423639e-05, "loss": 0.1833, "step": 230 }, { "epoch": 0.8584842387659289, "grad_norm": 0.03971581161022186, "learning_rate": 7.928286852589642e-05, "loss": 0.1813, "step": 240 }, { "epoch": 0.8942544153811759, "grad_norm": 0.0481211394071579, "learning_rate": 7.795484727755644e-05, "loss": 0.1791, "step": 250 }, { "epoch": 0.8942544153811759, "eval_loss": 0.17786787450313568, "eval_runtime": 681.7787, "eval_samples_per_second": 2.916, "eval_steps_per_second": 0.729, "step": 250 }, { "epoch": 0.930024591996423, "grad_norm": 0.04264125972986221, "learning_rate": 7.662682602921647e-05, "loss": 0.1737, "step": 260 }, { "epoch": 0.96579476861167, "grad_norm": 0.04976845905184746, "learning_rate": 7.52988047808765e-05, "loss": 0.1749, "step": 270 }, { "epoch": 1.001564945226917, "grad_norm": 0.05254867300391197, "learning_rate": 7.397078353253653e-05, "loss": 0.1719, "step": 280 }, { "epoch": 1.037335121842164, "grad_norm": 0.055027466267347336, "learning_rate": 7.264276228419655e-05, "loss": 0.1651, "step": 290 }, { "epoch": 1.0731052984574112, "grad_norm": 0.07110296189785004, "learning_rate": 7.131474103585658e-05, "loss": 0.1612, "step": 300 }, { "epoch": 1.0731052984574112, "eval_loss": 0.16210857033729553, "eval_runtime": 680.436, "eval_samples_per_second": 2.922, "eval_steps_per_second": 0.73, "step": 300 }, { "epoch": 1.1088754750726582, "grad_norm": 0.06281454861164093, "learning_rate": 6.998671978751661e-05, "loss": 0.1575, "step": 310 }, { "epoch": 1.144645651687905, "grad_norm": 0.07971349358558655, "learning_rate": 6.865869853917662e-05, "loss": 0.1534, "step": 320 }, { "epoch": 1.1804158283031523, "grad_norm": 0.08370919525623322, "learning_rate": 6.733067729083665e-05, "loss": 0.1487, "step": 330 }, { "epoch": 1.2161860049183992, "grad_norm": 0.08752310276031494, "learning_rate": 6.600265604249668e-05, "loss": 0.1461, "step": 340 }, { "epoch": 1.2519561815336462, "grad_norm": 0.09238439798355103, "learning_rate": 6.46746347941567e-05, "loss": 0.1394, "step": 350 }, { "epoch": 1.2519561815336462, "eval_loss": 0.13786284625530243, "eval_runtime": 680.5234, "eval_samples_per_second": 2.921, "eval_steps_per_second": 0.73, "step": 350 }, { "epoch": 1.2877263581488934, "grad_norm": 0.10012154281139374, "learning_rate": 6.334661354581673e-05, "loss": 0.1359, "step": 360 }, { "epoch": 1.3234965347641405, "grad_norm": 0.11639019101858139, "learning_rate": 6.201859229747676e-05, "loss": 0.1309, "step": 370 }, { "epoch": 1.3592667113793875, "grad_norm": 0.10917984694242477, "learning_rate": 6.069057104913679e-05, "loss": 0.123, "step": 380 }, { "epoch": 1.3950368879946344, "grad_norm": 0.1265992373228073, "learning_rate": 5.936254980079682e-05, "loss": 0.1161, "step": 390 }, { "epoch": 1.4308070646098816, "grad_norm": 0.12816202640533447, "learning_rate": 5.803452855245685e-05, "loss": 0.1115, "step": 400 }, { "epoch": 1.4308070646098816, "eval_loss": 0.10932095348834991, "eval_runtime": 680.5519, "eval_samples_per_second": 2.921, "eval_steps_per_second": 0.73, "step": 400 }, { "epoch": 1.4665772412251286, "grad_norm": 0.13660216331481934, "learning_rate": 5.6706507304116875e-05, "loss": 0.105, "step": 410 }, { "epoch": 1.5023474178403755, "grad_norm": 0.14816845953464508, "learning_rate": 5.537848605577689e-05, "loss": 0.1016, "step": 420 }, { "epoch": 1.5381175944556227, "grad_norm": 0.16880381107330322, "learning_rate": 5.405046480743692e-05, "loss": 0.0949, "step": 430 }, { "epoch": 1.5738877710708696, "grad_norm": 0.15608805418014526, "learning_rate": 5.2722443559096944e-05, "loss": 0.09, "step": 440 }, { "epoch": 1.6096579476861166, "grad_norm": 0.15114428102970123, "learning_rate": 5.139442231075697e-05, "loss": 0.0852, "step": 450 }, { "epoch": 1.6096579476861166, "eval_loss": 0.08125962316989899, "eval_runtime": 680.5355, "eval_samples_per_second": 2.921, "eval_steps_per_second": 0.73, "step": 450 }, { "epoch": 1.6454281243013638, "grad_norm": 0.1671249121427536, "learning_rate": 5.0066401062417e-05, "loss": 0.0793, "step": 460 }, { "epoch": 1.681198300916611, "grad_norm": 0.16212376952171326, "learning_rate": 4.8738379814077026e-05, "loss": 0.0745, "step": 470 }, { "epoch": 1.7169684775318577, "grad_norm": 0.1631159633398056, "learning_rate": 4.7410358565737054e-05, "loss": 0.0699, "step": 480 }, { "epoch": 1.7527386541471048, "grad_norm": 0.17926354706287384, "learning_rate": 4.608233731739708e-05, "loss": 0.0621, "step": 490 }, { "epoch": 1.788508830762352, "grad_norm": 0.1659461259841919, "learning_rate": 4.475431606905711e-05, "loss": 0.06, "step": 500 }, { "epoch": 1.788508830762352, "eval_loss": 0.058878425508737564, "eval_runtime": 680.5096, "eval_samples_per_second": 2.921, "eval_steps_per_second": 0.73, "step": 500 }, { "epoch": 1.824279007377599, "grad_norm": 0.1780182421207428, "learning_rate": 4.3426294820717136e-05, "loss": 0.0564, "step": 510 }, { "epoch": 1.860049183992846, "grad_norm": 0.17848838865756989, "learning_rate": 4.2098273572377163e-05, "loss": 0.0522, "step": 520 }, { "epoch": 1.895819360608093, "grad_norm": 0.17067597806453705, "learning_rate": 4.077025232403719e-05, "loss": 0.0505, "step": 530 }, { "epoch": 1.93158953722334, "grad_norm": 0.16824504733085632, "learning_rate": 3.944223107569721e-05, "loss": 0.0464, "step": 540 }, { "epoch": 1.967359713838587, "grad_norm": 0.17104943096637726, "learning_rate": 3.811420982735724e-05, "loss": 0.043, "step": 550 }, { "epoch": 1.967359713838587, "eval_loss": 0.04172534495592117, "eval_runtime": 680.4722, "eval_samples_per_second": 2.922, "eval_steps_per_second": 0.73, "step": 550 }, { "epoch": 2.003129890453834, "grad_norm": 0.1792486160993576, "learning_rate": 3.6786188579017266e-05, "loss": 0.0399, "step": 560 }, { "epoch": 2.0389000670690813, "grad_norm": 0.13952085375785828, "learning_rate": 3.5458167330677294e-05, "loss": 0.0327, "step": 570 }, { "epoch": 2.074670243684328, "grad_norm": 0.1532105654478073, "learning_rate": 3.4130146082337314e-05, "loss": 0.0308, "step": 580 }, { "epoch": 2.1104404202995752, "grad_norm": 0.16048988699913025, "learning_rate": 3.280212483399734e-05, "loss": 0.0295, "step": 590 }, { "epoch": 2.1462105969148224, "grad_norm": 0.13789258897304535, "learning_rate": 3.147410358565737e-05, "loss": 0.0294, "step": 600 }, { "epoch": 2.1462105969148224, "eval_loss": 0.030796948820352554, "eval_runtime": 681.0206, "eval_samples_per_second": 2.919, "eval_steps_per_second": 0.73, "step": 600 } ], "logging_steps": 10, "max_steps": 837, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.1458905533452386e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }