{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1092, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027472527472527472, "grad_norm": 12.728795051574707, "learning_rate": 5e-06, "loss": 0.9765, "step": 10 }, { "epoch": 0.054945054945054944, "grad_norm": 2.706224203109741, "learning_rate": 5e-06, "loss": 0.839, "step": 20 }, { "epoch": 0.08241758241758242, "grad_norm": 1.059246301651001, "learning_rate": 5e-06, "loss": 0.7854, "step": 30 }, { "epoch": 0.10989010989010989, "grad_norm": 0.8609521389007568, "learning_rate": 5e-06, "loss": 0.7552, "step": 40 }, { "epoch": 0.13736263736263737, "grad_norm": 0.8304686546325684, "learning_rate": 5e-06, "loss": 0.7404, "step": 50 }, { "epoch": 0.16483516483516483, "grad_norm": 0.95517498254776, "learning_rate": 5e-06, "loss": 0.717, "step": 60 }, { "epoch": 0.19230769230769232, "grad_norm": 0.8028889298439026, "learning_rate": 5e-06, "loss": 0.7069, "step": 70 }, { "epoch": 0.21978021978021978, "grad_norm": 1.287640929222107, "learning_rate": 5e-06, "loss": 0.6991, "step": 80 }, { "epoch": 0.24725274725274726, "grad_norm": 2.1982975006103516, "learning_rate": 5e-06, "loss": 0.6911, "step": 90 }, { "epoch": 0.27472527472527475, "grad_norm": 0.6827638149261475, "learning_rate": 5e-06, "loss": 0.6901, "step": 100 }, { "epoch": 0.3021978021978022, "grad_norm": 0.9066330194473267, "learning_rate": 5e-06, "loss": 0.6826, "step": 110 }, { "epoch": 0.32967032967032966, "grad_norm": 0.6553434133529663, "learning_rate": 5e-06, "loss": 0.6863, "step": 120 }, { "epoch": 0.35714285714285715, "grad_norm": 0.9135424494743347, "learning_rate": 5e-06, "loss": 0.6745, "step": 130 }, { "epoch": 0.38461538461538464, "grad_norm": 0.7348975539207458, "learning_rate": 5e-06, "loss": 0.6759, "step": 140 }, { "epoch": 0.41208791208791207, "grad_norm": 0.9071689248085022, "learning_rate": 5e-06, "loss": 0.6673, "step": 150 }, { "epoch": 0.43956043956043955, "grad_norm": 0.7530332207679749, "learning_rate": 5e-06, "loss": 0.6651, "step": 160 }, { "epoch": 0.46703296703296704, "grad_norm": 0.7991921305656433, "learning_rate": 5e-06, "loss": 0.6716, "step": 170 }, { "epoch": 0.4945054945054945, "grad_norm": 0.8702341318130493, "learning_rate": 5e-06, "loss": 0.6589, "step": 180 }, { "epoch": 0.521978021978022, "grad_norm": 0.5698891878128052, "learning_rate": 5e-06, "loss": 0.6618, "step": 190 }, { "epoch": 0.5494505494505495, "grad_norm": 0.5574747323989868, "learning_rate": 5e-06, "loss": 0.6629, "step": 200 }, { "epoch": 0.5769230769230769, "grad_norm": 0.6472902894020081, "learning_rate": 5e-06, "loss": 0.6582, "step": 210 }, { "epoch": 0.6043956043956044, "grad_norm": 0.5712385177612305, "learning_rate": 5e-06, "loss": 0.6551, "step": 220 }, { "epoch": 0.6318681318681318, "grad_norm": 0.492366224527359, "learning_rate": 5e-06, "loss": 0.6485, "step": 230 }, { "epoch": 0.6593406593406593, "grad_norm": 0.7048478126525879, "learning_rate": 5e-06, "loss": 0.6484, "step": 240 }, { "epoch": 0.6868131868131868, "grad_norm": 0.537118136882782, "learning_rate": 5e-06, "loss": 0.6488, "step": 250 }, { "epoch": 0.7142857142857143, "grad_norm": 0.49864494800567627, "learning_rate": 5e-06, "loss": 0.6463, "step": 260 }, { "epoch": 0.7417582417582418, "grad_norm": 0.7438676953315735, "learning_rate": 5e-06, "loss": 0.6498, "step": 270 }, { "epoch": 0.7692307692307693, "grad_norm": 0.5052060484886169, "learning_rate": 5e-06, "loss": 0.6464, "step": 280 }, { "epoch": 0.7967032967032966, "grad_norm": 0.548953115940094, "learning_rate": 5e-06, "loss": 0.6441, "step": 290 }, { "epoch": 0.8241758241758241, "grad_norm": 0.6289977431297302, "learning_rate": 5e-06, "loss": 0.6415, "step": 300 }, { "epoch": 0.8516483516483516, "grad_norm": 0.5599405765533447, "learning_rate": 5e-06, "loss": 0.6419, "step": 310 }, { "epoch": 0.8791208791208791, "grad_norm": 0.6185189485549927, "learning_rate": 5e-06, "loss": 0.6371, "step": 320 }, { "epoch": 0.9065934065934066, "grad_norm": 0.5025250911712646, "learning_rate": 5e-06, "loss": 0.6388, "step": 330 }, { "epoch": 0.9340659340659341, "grad_norm": 0.596026599407196, "learning_rate": 5e-06, "loss": 0.6371, "step": 340 }, { "epoch": 0.9615384615384616, "grad_norm": 0.5150969624519348, "learning_rate": 5e-06, "loss": 0.6323, "step": 350 }, { "epoch": 0.989010989010989, "grad_norm": 0.5902632474899292, "learning_rate": 5e-06, "loss": 0.6304, "step": 360 }, { "epoch": 1.0, "eval_loss": 0.6319476366043091, "eval_runtime": 33.2599, "eval_samples_per_second": 294.589, "eval_steps_per_second": 1.173, "step": 364 }, { "epoch": 1.0164835164835164, "grad_norm": 0.684147298336029, "learning_rate": 5e-06, "loss": 0.6119, "step": 370 }, { "epoch": 1.043956043956044, "grad_norm": 0.6040191054344177, "learning_rate": 5e-06, "loss": 0.5939, "step": 380 }, { "epoch": 1.0714285714285714, "grad_norm": 0.6056487560272217, "learning_rate": 5e-06, "loss": 0.5955, "step": 390 }, { "epoch": 1.098901098901099, "grad_norm": 0.6380589008331299, "learning_rate": 5e-06, "loss": 0.5917, "step": 400 }, { "epoch": 1.1263736263736264, "grad_norm": 0.734009861946106, "learning_rate": 5e-06, "loss": 0.5912, "step": 410 }, { "epoch": 1.1538461538461537, "grad_norm": 0.5856278538703918, "learning_rate": 5e-06, "loss": 0.5887, "step": 420 }, { "epoch": 1.1813186813186813, "grad_norm": 0.5818033218383789, "learning_rate": 5e-06, "loss": 0.5959, "step": 430 }, { "epoch": 1.2087912087912087, "grad_norm": 0.5466902852058411, "learning_rate": 5e-06, "loss": 0.5927, "step": 440 }, { "epoch": 1.2362637362637363, "grad_norm": 0.5269283056259155, "learning_rate": 5e-06, "loss": 0.5928, "step": 450 }, { "epoch": 1.2637362637362637, "grad_norm": 0.5852891802787781, "learning_rate": 5e-06, "loss": 0.5888, "step": 460 }, { "epoch": 1.2912087912087913, "grad_norm": 0.5838508605957031, "learning_rate": 5e-06, "loss": 0.5868, "step": 470 }, { "epoch": 1.3186813186813187, "grad_norm": 0.5828148722648621, "learning_rate": 5e-06, "loss": 0.5881, "step": 480 }, { "epoch": 1.3461538461538463, "grad_norm": 0.5466414093971252, "learning_rate": 5e-06, "loss": 0.5844, "step": 490 }, { "epoch": 1.3736263736263736, "grad_norm": 0.6251242160797119, "learning_rate": 5e-06, "loss": 0.5867, "step": 500 }, { "epoch": 1.401098901098901, "grad_norm": 0.6761026382446289, "learning_rate": 5e-06, "loss": 0.5878, "step": 510 }, { "epoch": 1.4285714285714286, "grad_norm": 0.8003695011138916, "learning_rate": 5e-06, "loss": 0.5858, "step": 520 }, { "epoch": 1.456043956043956, "grad_norm": 0.6401771903038025, "learning_rate": 5e-06, "loss": 0.5859, "step": 530 }, { "epoch": 1.4835164835164836, "grad_norm": 0.722094714641571, "learning_rate": 5e-06, "loss": 0.5885, "step": 540 }, { "epoch": 1.510989010989011, "grad_norm": 0.6249004006385803, "learning_rate": 5e-06, "loss": 0.5846, "step": 550 }, { "epoch": 1.5384615384615383, "grad_norm": 0.5867748856544495, "learning_rate": 5e-06, "loss": 0.5841, "step": 560 }, { "epoch": 1.565934065934066, "grad_norm": 0.6063013672828674, "learning_rate": 5e-06, "loss": 0.5854, "step": 570 }, { "epoch": 1.5934065934065935, "grad_norm": 0.538633406162262, "learning_rate": 5e-06, "loss": 0.5812, "step": 580 }, { "epoch": 1.620879120879121, "grad_norm": 0.6249384880065918, "learning_rate": 5e-06, "loss": 0.5802, "step": 590 }, { "epoch": 1.6483516483516483, "grad_norm": 0.5973653197288513, "learning_rate": 5e-06, "loss": 0.5848, "step": 600 }, { "epoch": 1.6758241758241759, "grad_norm": 0.5950356125831604, "learning_rate": 5e-06, "loss": 0.5807, "step": 610 }, { "epoch": 1.7032967032967035, "grad_norm": 0.5238327980041504, "learning_rate": 5e-06, "loss": 0.5753, "step": 620 }, { "epoch": 1.7307692307692308, "grad_norm": 0.5083211660385132, "learning_rate": 5e-06, "loss": 0.5801, "step": 630 }, { "epoch": 1.7582417582417582, "grad_norm": 0.8316366076469421, "learning_rate": 5e-06, "loss": 0.5745, "step": 640 }, { "epoch": 1.7857142857142856, "grad_norm": 0.5875478386878967, "learning_rate": 5e-06, "loss": 0.58, "step": 650 }, { "epoch": 1.8131868131868132, "grad_norm": 0.5625612735748291, "learning_rate": 5e-06, "loss": 0.5721, "step": 660 }, { "epoch": 1.8406593406593408, "grad_norm": 0.5859466791152954, "learning_rate": 5e-06, "loss": 0.5784, "step": 670 }, { "epoch": 1.8681318681318682, "grad_norm": 0.5293060541152954, "learning_rate": 5e-06, "loss": 0.575, "step": 680 }, { "epoch": 1.8956043956043955, "grad_norm": 0.516545832157135, "learning_rate": 5e-06, "loss": 0.5771, "step": 690 }, { "epoch": 1.9230769230769231, "grad_norm": 0.5076315999031067, "learning_rate": 5e-06, "loss": 0.5759, "step": 700 }, { "epoch": 1.9505494505494505, "grad_norm": 0.46912071108818054, "learning_rate": 5e-06, "loss": 0.5738, "step": 710 }, { "epoch": 1.978021978021978, "grad_norm": 0.5850493907928467, "learning_rate": 5e-06, "loss": 0.5776, "step": 720 }, { "epoch": 2.0, "eval_loss": 0.6049710512161255, "eval_runtime": 33.2692, "eval_samples_per_second": 294.506, "eval_steps_per_second": 1.172, "step": 728 }, { "epoch": 2.0054945054945055, "grad_norm": 0.8460399508476257, "learning_rate": 5e-06, "loss": 0.5647, "step": 730 }, { "epoch": 2.032967032967033, "grad_norm": 0.5859711170196533, "learning_rate": 5e-06, "loss": 0.5343, "step": 740 }, { "epoch": 2.0604395604395602, "grad_norm": 0.5478792190551758, "learning_rate": 5e-06, "loss": 0.5346, "step": 750 }, { "epoch": 2.087912087912088, "grad_norm": 0.5353866815567017, "learning_rate": 5e-06, "loss": 0.5386, "step": 760 }, { "epoch": 2.1153846153846154, "grad_norm": 0.589070737361908, "learning_rate": 5e-06, "loss": 0.5364, "step": 770 }, { "epoch": 2.142857142857143, "grad_norm": 0.7154620289802551, "learning_rate": 5e-06, "loss": 0.5357, "step": 780 }, { "epoch": 2.17032967032967, "grad_norm": 0.5796002149581909, "learning_rate": 5e-06, "loss": 0.532, "step": 790 }, { "epoch": 2.197802197802198, "grad_norm": 0.5556730031967163, "learning_rate": 5e-06, "loss": 0.5389, "step": 800 }, { "epoch": 2.2252747252747254, "grad_norm": 0.5793793201446533, "learning_rate": 5e-06, "loss": 0.5323, "step": 810 }, { "epoch": 2.2527472527472527, "grad_norm": 0.6171457767486572, "learning_rate": 5e-06, "loss": 0.5325, "step": 820 }, { "epoch": 2.28021978021978, "grad_norm": 0.563108503818512, "learning_rate": 5e-06, "loss": 0.5305, "step": 830 }, { "epoch": 2.3076923076923075, "grad_norm": 0.6023669242858887, "learning_rate": 5e-06, "loss": 0.5336, "step": 840 }, { "epoch": 2.3351648351648353, "grad_norm": 0.6838552355766296, "learning_rate": 5e-06, "loss": 0.5299, "step": 850 }, { "epoch": 2.3626373626373627, "grad_norm": 0.5465499758720398, "learning_rate": 5e-06, "loss": 0.5352, "step": 860 }, { "epoch": 2.39010989010989, "grad_norm": 0.6073447465896606, "learning_rate": 5e-06, "loss": 0.534, "step": 870 }, { "epoch": 2.4175824175824174, "grad_norm": 0.5182047486305237, "learning_rate": 5e-06, "loss": 0.5317, "step": 880 }, { "epoch": 2.4450549450549453, "grad_norm": 0.543487548828125, "learning_rate": 5e-06, "loss": 0.5381, "step": 890 }, { "epoch": 2.4725274725274726, "grad_norm": 0.5238776803016663, "learning_rate": 5e-06, "loss": 0.5381, "step": 900 }, { "epoch": 2.5, "grad_norm": 0.5654677152633667, "learning_rate": 5e-06, "loss": 0.5316, "step": 910 }, { "epoch": 2.5274725274725274, "grad_norm": 0.6107837557792664, "learning_rate": 5e-06, "loss": 0.5368, "step": 920 }, { "epoch": 2.5549450549450547, "grad_norm": 0.5588423609733582, "learning_rate": 5e-06, "loss": 0.5402, "step": 930 }, { "epoch": 2.5824175824175826, "grad_norm": 0.5652673840522766, "learning_rate": 5e-06, "loss": 0.5382, "step": 940 }, { "epoch": 2.60989010989011, "grad_norm": 0.5125435590744019, "learning_rate": 5e-06, "loss": 0.539, "step": 950 }, { "epoch": 2.6373626373626373, "grad_norm": 0.5421128273010254, "learning_rate": 5e-06, "loss": 0.5335, "step": 960 }, { "epoch": 2.6648351648351647, "grad_norm": 0.5882076621055603, "learning_rate": 5e-06, "loss": 0.5341, "step": 970 }, { "epoch": 2.6923076923076925, "grad_norm": 0.6641466021537781, "learning_rate": 5e-06, "loss": 0.5301, "step": 980 }, { "epoch": 2.71978021978022, "grad_norm": 0.5694056153297424, "learning_rate": 5e-06, "loss": 0.537, "step": 990 }, { "epoch": 2.7472527472527473, "grad_norm": 0.48833751678466797, "learning_rate": 5e-06, "loss": 0.5333, "step": 1000 }, { "epoch": 2.7747252747252746, "grad_norm": 0.5480605959892273, "learning_rate": 5e-06, "loss": 0.5352, "step": 1010 }, { "epoch": 2.802197802197802, "grad_norm": 0.6873406767845154, "learning_rate": 5e-06, "loss": 0.5389, "step": 1020 }, { "epoch": 2.82967032967033, "grad_norm": 0.5708329081535339, "learning_rate": 5e-06, "loss": 0.5346, "step": 1030 }, { "epoch": 2.857142857142857, "grad_norm": 0.5935104489326477, "learning_rate": 5e-06, "loss": 0.5331, "step": 1040 }, { "epoch": 2.8846153846153846, "grad_norm": 0.5802971124649048, "learning_rate": 5e-06, "loss": 0.53, "step": 1050 }, { "epoch": 2.912087912087912, "grad_norm": 0.5741780400276184, "learning_rate": 5e-06, "loss": 0.5291, "step": 1060 }, { "epoch": 2.9395604395604398, "grad_norm": 0.578640878200531, "learning_rate": 5e-06, "loss": 0.5326, "step": 1070 }, { "epoch": 2.967032967032967, "grad_norm": 0.5405354499816895, "learning_rate": 5e-06, "loss": 0.5314, "step": 1080 }, { "epoch": 2.9945054945054945, "grad_norm": 0.5508515238761902, "learning_rate": 5e-06, "loss": 0.5347, "step": 1090 }, { "epoch": 3.0, "eval_loss": 0.5989904403686523, "eval_runtime": 33.1874, "eval_samples_per_second": 295.232, "eval_steps_per_second": 1.175, "step": 1092 }, { "epoch": 3.0, "step": 1092, "total_flos": 5.156086030431106e+19, "train_loss": 0.6005454126731816, "train_runtime": 7773.4306, "train_samples_per_second": 71.841, "train_steps_per_second": 0.14 } ], "logging_steps": 10, "max_steps": 1092, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.156086030431106e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }