{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1179, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02544529262086514, "grad_norm": 3.0892276763916016, "learning_rate": 5e-06, "loss": 0.8937, "step": 10 }, { "epoch": 0.05089058524173028, "grad_norm": 1.0776044130325317, "learning_rate": 5e-06, "loss": 0.7655, "step": 20 }, { "epoch": 0.07633587786259542, "grad_norm": 2.4046175479888916, "learning_rate": 5e-06, "loss": 0.7156, "step": 30 }, { "epoch": 0.10178117048346055, "grad_norm": 1.0319740772247314, "learning_rate": 5e-06, "loss": 0.6884, "step": 40 }, { "epoch": 0.1272264631043257, "grad_norm": 0.8365700840950012, "learning_rate": 5e-06, "loss": 0.6666, "step": 50 }, { "epoch": 0.15267175572519084, "grad_norm": 1.0443907976150513, "learning_rate": 5e-06, "loss": 0.6517, "step": 60 }, { "epoch": 0.178117048346056, "grad_norm": 0.6737371683120728, "learning_rate": 5e-06, "loss": 0.6476, "step": 70 }, { "epoch": 0.2035623409669211, "grad_norm": 0.7133864760398865, "learning_rate": 5e-06, "loss": 0.6313, "step": 80 }, { "epoch": 0.22900763358778625, "grad_norm": 0.873466432094574, "learning_rate": 5e-06, "loss": 0.6223, "step": 90 }, { "epoch": 0.2544529262086514, "grad_norm": 0.8450167179107666, "learning_rate": 5e-06, "loss": 0.6252, "step": 100 }, { "epoch": 0.27989821882951654, "grad_norm": 0.5868242383003235, "learning_rate": 5e-06, "loss": 0.6153, "step": 110 }, { "epoch": 0.3053435114503817, "grad_norm": 0.6943073868751526, "learning_rate": 5e-06, "loss": 0.6155, "step": 120 }, { "epoch": 0.33078880407124683, "grad_norm": 0.568712592124939, "learning_rate": 5e-06, "loss": 0.6086, "step": 130 }, { "epoch": 0.356234096692112, "grad_norm": 0.5304349660873413, "learning_rate": 5e-06, "loss": 0.6046, "step": 140 }, { "epoch": 0.3816793893129771, "grad_norm": 0.5914269089698792, "learning_rate": 5e-06, "loss": 0.6014, "step": 150 }, { "epoch": 0.4071246819338422, "grad_norm": 0.6255616545677185, "learning_rate": 5e-06, "loss": 0.6059, "step": 160 }, { "epoch": 0.43256997455470736, "grad_norm": 0.6703746318817139, "learning_rate": 5e-06, "loss": 0.595, "step": 170 }, { "epoch": 0.4580152671755725, "grad_norm": 0.6239282488822937, "learning_rate": 5e-06, "loss": 0.5961, "step": 180 }, { "epoch": 0.48346055979643765, "grad_norm": 0.5409078598022461, "learning_rate": 5e-06, "loss": 0.5936, "step": 190 }, { "epoch": 0.5089058524173028, "grad_norm": 0.47156909108161926, "learning_rate": 5e-06, "loss": 0.5871, "step": 200 }, { "epoch": 0.5343511450381679, "grad_norm": 0.6108508110046387, "learning_rate": 5e-06, "loss": 0.5936, "step": 210 }, { "epoch": 0.5597964376590331, "grad_norm": 0.510125994682312, "learning_rate": 5e-06, "loss": 0.5804, "step": 220 }, { "epoch": 0.5852417302798982, "grad_norm": 0.532505214214325, "learning_rate": 5e-06, "loss": 0.583, "step": 230 }, { "epoch": 0.6106870229007634, "grad_norm": 0.732284426689148, "learning_rate": 5e-06, "loss": 0.5817, "step": 240 }, { "epoch": 0.6361323155216285, "grad_norm": 0.624311089515686, "learning_rate": 5e-06, "loss": 0.5821, "step": 250 }, { "epoch": 0.6615776081424937, "grad_norm": 0.6264485120773315, "learning_rate": 5e-06, "loss": 0.5805, "step": 260 }, { "epoch": 0.6870229007633588, "grad_norm": 0.5334043502807617, "learning_rate": 5e-06, "loss": 0.5818, "step": 270 }, { "epoch": 0.712468193384224, "grad_norm": 0.5711916089057922, "learning_rate": 5e-06, "loss": 0.5768, "step": 280 }, { "epoch": 0.7379134860050891, "grad_norm": 0.4914567172527313, "learning_rate": 5e-06, "loss": 0.5733, "step": 290 }, { "epoch": 0.7633587786259542, "grad_norm": 0.4886472523212433, "learning_rate": 5e-06, "loss": 0.5827, "step": 300 }, { "epoch": 0.7888040712468194, "grad_norm": 0.5745688080787659, "learning_rate": 5e-06, "loss": 0.5756, "step": 310 }, { "epoch": 0.8142493638676844, "grad_norm": 0.6722710132598877, "learning_rate": 5e-06, "loss": 0.5722, "step": 320 }, { "epoch": 0.8396946564885496, "grad_norm": 0.5262378454208374, "learning_rate": 5e-06, "loss": 0.5704, "step": 330 }, { "epoch": 0.8651399491094147, "grad_norm": 0.5720959305763245, "learning_rate": 5e-06, "loss": 0.5739, "step": 340 }, { "epoch": 0.8905852417302799, "grad_norm": 0.525618314743042, "learning_rate": 5e-06, "loss": 0.5648, "step": 350 }, { "epoch": 0.916030534351145, "grad_norm": 0.5551468133926392, "learning_rate": 5e-06, "loss": 0.5726, "step": 360 }, { "epoch": 0.9414758269720102, "grad_norm": 0.5689135193824768, "learning_rate": 5e-06, "loss": 0.5644, "step": 370 }, { "epoch": 0.9669211195928753, "grad_norm": 0.5880911946296692, "learning_rate": 5e-06, "loss": 0.5628, "step": 380 }, { "epoch": 0.9923664122137404, "grad_norm": 0.5270939469337463, "learning_rate": 5e-06, "loss": 0.5649, "step": 390 }, { "epoch": 1.0, "eval_loss": 0.5690909624099731, "eval_runtime": 35.8687, "eval_samples_per_second": 294.602, "eval_steps_per_second": 1.171, "step": 393 }, { "epoch": 1.0178117048346056, "grad_norm": 0.6128649115562439, "learning_rate": 5e-06, "loss": 0.5446, "step": 400 }, { "epoch": 1.0432569974554706, "grad_norm": 0.6446624398231506, "learning_rate": 5e-06, "loss": 0.5287, "step": 410 }, { "epoch": 1.0687022900763359, "grad_norm": 0.6626042127609253, "learning_rate": 5e-06, "loss": 0.528, "step": 420 }, { "epoch": 1.094147582697201, "grad_norm": 0.6851757168769836, "learning_rate": 5e-06, "loss": 0.5275, "step": 430 }, { "epoch": 1.1195928753180662, "grad_norm": 0.5607665181159973, "learning_rate": 5e-06, "loss": 0.5317, "step": 440 }, { "epoch": 1.1450381679389312, "grad_norm": 0.5421668887138367, "learning_rate": 5e-06, "loss": 0.5279, "step": 450 }, { "epoch": 1.1704834605597965, "grad_norm": 0.5087480545043945, "learning_rate": 5e-06, "loss": 0.5323, "step": 460 }, { "epoch": 1.1959287531806615, "grad_norm": 0.6041454672813416, "learning_rate": 5e-06, "loss": 0.5221, "step": 470 }, { "epoch": 1.2213740458015268, "grad_norm": 0.5389483571052551, "learning_rate": 5e-06, "loss": 0.5296, "step": 480 }, { "epoch": 1.2468193384223918, "grad_norm": 0.5498439073562622, "learning_rate": 5e-06, "loss": 0.5236, "step": 490 }, { "epoch": 1.272264631043257, "grad_norm": 0.534608781337738, "learning_rate": 5e-06, "loss": 0.5274, "step": 500 }, { "epoch": 1.297709923664122, "grad_norm": 0.5173637866973877, "learning_rate": 5e-06, "loss": 0.5234, "step": 510 }, { "epoch": 1.3231552162849873, "grad_norm": 0.640381932258606, "learning_rate": 5e-06, "loss": 0.5191, "step": 520 }, { "epoch": 1.3486005089058524, "grad_norm": 0.636634111404419, "learning_rate": 5e-06, "loss": 0.5274, "step": 530 }, { "epoch": 1.3740458015267176, "grad_norm": 0.4825289249420166, "learning_rate": 5e-06, "loss": 0.5271, "step": 540 }, { "epoch": 1.3994910941475827, "grad_norm": 0.7268803715705872, "learning_rate": 5e-06, "loss": 0.5239, "step": 550 }, { "epoch": 1.424936386768448, "grad_norm": 0.5402505993843079, "learning_rate": 5e-06, "loss": 0.5212, "step": 560 }, { "epoch": 1.450381679389313, "grad_norm": 0.5150734186172485, "learning_rate": 5e-06, "loss": 0.5147, "step": 570 }, { "epoch": 1.4758269720101782, "grad_norm": 0.5088803768157959, "learning_rate": 5e-06, "loss": 0.5238, "step": 580 }, { "epoch": 1.5012722646310432, "grad_norm": 0.5912160277366638, "learning_rate": 5e-06, "loss": 0.5226, "step": 590 }, { "epoch": 1.5267175572519083, "grad_norm": 0.5235996842384338, "learning_rate": 5e-06, "loss": 0.5161, "step": 600 }, { "epoch": 1.5521628498727735, "grad_norm": 0.6075550317764282, "learning_rate": 5e-06, "loss": 0.5157, "step": 610 }, { "epoch": 1.5776081424936388, "grad_norm": 0.5467493534088135, "learning_rate": 5e-06, "loss": 0.5173, "step": 620 }, { "epoch": 1.6030534351145038, "grad_norm": 0.5767125487327576, "learning_rate": 5e-06, "loss": 0.518, "step": 630 }, { "epoch": 1.6284987277353689, "grad_norm": 0.6012499332427979, "learning_rate": 5e-06, "loss": 0.5136, "step": 640 }, { "epoch": 1.6539440203562341, "grad_norm": 0.5314013361930847, "learning_rate": 5e-06, "loss": 0.5175, "step": 650 }, { "epoch": 1.6793893129770994, "grad_norm": 0.6663753390312195, "learning_rate": 5e-06, "loss": 0.5211, "step": 660 }, { "epoch": 1.7048346055979644, "grad_norm": 0.6120356917381287, "learning_rate": 5e-06, "loss": 0.5228, "step": 670 }, { "epoch": 1.7302798982188294, "grad_norm": 0.6509705781936646, "learning_rate": 5e-06, "loss": 0.5177, "step": 680 }, { "epoch": 1.7557251908396947, "grad_norm": 0.5193184614181519, "learning_rate": 5e-06, "loss": 0.5147, "step": 690 }, { "epoch": 1.78117048346056, "grad_norm": 0.5160722136497498, "learning_rate": 5e-06, "loss": 0.5167, "step": 700 }, { "epoch": 1.806615776081425, "grad_norm": 0.6566229462623596, "learning_rate": 5e-06, "loss": 0.5116, "step": 710 }, { "epoch": 1.83206106870229, "grad_norm": 0.4844520688056946, "learning_rate": 5e-06, "loss": 0.5125, "step": 720 }, { "epoch": 1.8575063613231553, "grad_norm": 0.5609481334686279, "learning_rate": 5e-06, "loss": 0.5134, "step": 730 }, { "epoch": 1.8829516539440203, "grad_norm": 0.4838588535785675, "learning_rate": 5e-06, "loss": 0.5111, "step": 740 }, { "epoch": 1.9083969465648853, "grad_norm": 0.5089361667633057, "learning_rate": 5e-06, "loss": 0.5147, "step": 750 }, { "epoch": 1.9338422391857506, "grad_norm": 0.4935343861579895, "learning_rate": 5e-06, "loss": 0.5092, "step": 760 }, { "epoch": 1.9592875318066159, "grad_norm": 0.5261263251304626, "learning_rate": 5e-06, "loss": 0.5092, "step": 770 }, { "epoch": 1.984732824427481, "grad_norm": 0.5327720642089844, "learning_rate": 5e-06, "loss": 0.5093, "step": 780 }, { "epoch": 2.0, "eval_loss": 0.5446056127548218, "eval_runtime": 35.403, "eval_samples_per_second": 298.478, "eval_steps_per_second": 1.186, "step": 786 }, { "epoch": 2.010178117048346, "grad_norm": 0.7140676975250244, "learning_rate": 5e-06, "loss": 0.4964, "step": 790 }, { "epoch": 2.035623409669211, "grad_norm": 0.5526481866836548, "learning_rate": 5e-06, "loss": 0.4699, "step": 800 }, { "epoch": 2.0610687022900764, "grad_norm": 0.8097620010375977, "learning_rate": 5e-06, "loss": 0.4747, "step": 810 }, { "epoch": 2.0865139949109412, "grad_norm": 0.5774556994438171, "learning_rate": 5e-06, "loss": 0.4731, "step": 820 }, { "epoch": 2.1119592875318065, "grad_norm": 0.7799936532974243, "learning_rate": 5e-06, "loss": 0.481, "step": 830 }, { "epoch": 2.1374045801526718, "grad_norm": 0.7324852347373962, "learning_rate": 5e-06, "loss": 0.472, "step": 840 }, { "epoch": 2.162849872773537, "grad_norm": 0.6810862421989441, "learning_rate": 5e-06, "loss": 0.4788, "step": 850 }, { "epoch": 2.188295165394402, "grad_norm": 0.6535518169403076, "learning_rate": 5e-06, "loss": 0.4787, "step": 860 }, { "epoch": 2.213740458015267, "grad_norm": 0.5570102334022522, "learning_rate": 5e-06, "loss": 0.4785, "step": 870 }, { "epoch": 2.2391857506361323, "grad_norm": 0.5471156239509583, "learning_rate": 5e-06, "loss": 0.4817, "step": 880 }, { "epoch": 2.2646310432569976, "grad_norm": 0.5434932112693787, "learning_rate": 5e-06, "loss": 0.4732, "step": 890 }, { "epoch": 2.2900763358778624, "grad_norm": 0.5234562754631042, "learning_rate": 5e-06, "loss": 0.4742, "step": 900 }, { "epoch": 2.3155216284987277, "grad_norm": 0.5137863159179688, "learning_rate": 5e-06, "loss": 0.4725, "step": 910 }, { "epoch": 2.340966921119593, "grad_norm": 0.5251743197441101, "learning_rate": 5e-06, "loss": 0.4793, "step": 920 }, { "epoch": 2.366412213740458, "grad_norm": 0.5063709020614624, "learning_rate": 5e-06, "loss": 0.4735, "step": 930 }, { "epoch": 2.391857506361323, "grad_norm": 0.5757549405097961, "learning_rate": 5e-06, "loss": 0.4728, "step": 940 }, { "epoch": 2.4173027989821882, "grad_norm": 0.48364320397377014, "learning_rate": 5e-06, "loss": 0.4785, "step": 950 }, { "epoch": 2.4427480916030535, "grad_norm": 0.5600482225418091, "learning_rate": 5e-06, "loss": 0.4748, "step": 960 }, { "epoch": 2.4681933842239188, "grad_norm": 0.5102260708808899, "learning_rate": 5e-06, "loss": 0.4752, "step": 970 }, { "epoch": 2.4936386768447836, "grad_norm": 0.642202615737915, "learning_rate": 5e-06, "loss": 0.4743, "step": 980 }, { "epoch": 2.519083969465649, "grad_norm": 0.7704691886901855, "learning_rate": 5e-06, "loss": 0.4743, "step": 990 }, { "epoch": 2.544529262086514, "grad_norm": 0.6885623931884766, "learning_rate": 5e-06, "loss": 0.4742, "step": 1000 }, { "epoch": 2.569974554707379, "grad_norm": 0.5846525430679321, "learning_rate": 5e-06, "loss": 0.4734, "step": 1010 }, { "epoch": 2.595419847328244, "grad_norm": 0.66885906457901, "learning_rate": 5e-06, "loss": 0.4758, "step": 1020 }, { "epoch": 2.6208651399491094, "grad_norm": 0.5633515119552612, "learning_rate": 5e-06, "loss": 0.474, "step": 1030 }, { "epoch": 2.6463104325699747, "grad_norm": 0.5127110481262207, "learning_rate": 5e-06, "loss": 0.4792, "step": 1040 }, { "epoch": 2.67175572519084, "grad_norm": 0.6211147308349609, "learning_rate": 5e-06, "loss": 0.4721, "step": 1050 }, { "epoch": 2.6972010178117047, "grad_norm": 0.700775146484375, "learning_rate": 5e-06, "loss": 0.4731, "step": 1060 }, { "epoch": 2.72264631043257, "grad_norm": 0.5291451215744019, "learning_rate": 5e-06, "loss": 0.475, "step": 1070 }, { "epoch": 2.7480916030534353, "grad_norm": 0.5429026484489441, "learning_rate": 5e-06, "loss": 0.4822, "step": 1080 }, { "epoch": 2.7735368956743, "grad_norm": 0.5198021531105042, "learning_rate": 5e-06, "loss": 0.4738, "step": 1090 }, { "epoch": 2.7989821882951653, "grad_norm": 0.4582120478153229, "learning_rate": 5e-06, "loss": 0.4709, "step": 1100 }, { "epoch": 2.8244274809160306, "grad_norm": 0.47587454319000244, "learning_rate": 5e-06, "loss": 0.4692, "step": 1110 }, { "epoch": 2.849872773536896, "grad_norm": 0.5221173167228699, "learning_rate": 5e-06, "loss": 0.4767, "step": 1120 }, { "epoch": 2.875318066157761, "grad_norm": 0.5617064833641052, "learning_rate": 5e-06, "loss": 0.478, "step": 1130 }, { "epoch": 2.900763358778626, "grad_norm": 0.5362435579299927, "learning_rate": 5e-06, "loss": 0.4745, "step": 1140 }, { "epoch": 2.926208651399491, "grad_norm": 0.5266700387001038, "learning_rate": 5e-06, "loss": 0.4725, "step": 1150 }, { "epoch": 2.9516539440203564, "grad_norm": 0.5272775292396545, "learning_rate": 5e-06, "loss": 0.4744, "step": 1160 }, { "epoch": 2.9770992366412212, "grad_norm": 0.6244516372680664, "learning_rate": 5e-06, "loss": 0.4766, "step": 1170 }, { "epoch": 3.0, "eval_loss": 0.5409571528434753, "eval_runtime": 35.9061, "eval_samples_per_second": 294.296, "eval_steps_per_second": 1.17, "step": 1179 }, { "epoch": 3.0, "step": 1179, "total_flos": 5.566873103382662e+19, "train_loss": 0.5357507543102779, "train_runtime": 8300.7415, "train_samples_per_second": 72.558, "train_steps_per_second": 0.142 } ], "logging_steps": 10, "max_steps": 1179, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.566873103382662e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }