{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.995245641838352, "eval_steps": 500, "global_step": 945, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03169572107765452, "grad_norm": 5.002077210650163, "learning_rate": 5e-06, "loss": 0.8641, "step": 10 }, { "epoch": 0.06339144215530904, "grad_norm": 1.0041823875277287, "learning_rate": 5e-06, "loss": 0.7817, "step": 20 }, { "epoch": 0.09508716323296355, "grad_norm": 0.7267029948833413, "learning_rate": 5e-06, "loss": 0.7429, "step": 30 }, { "epoch": 0.12678288431061807, "grad_norm": 0.624153716022588, "learning_rate": 5e-06, "loss": 0.7173, "step": 40 }, { "epoch": 0.15847860538827258, "grad_norm": 0.566309592028478, "learning_rate": 5e-06, "loss": 0.7097, "step": 50 }, { "epoch": 0.1901743264659271, "grad_norm": 0.8668636299502676, "learning_rate": 5e-06, "loss": 0.7121, "step": 60 }, { "epoch": 0.2218700475435816, "grad_norm": 0.684958377999424, "learning_rate": 5e-06, "loss": 0.6997, "step": 70 }, { "epoch": 0.25356576862123614, "grad_norm": 0.5215403890931859, "learning_rate": 5e-06, "loss": 0.6898, "step": 80 }, { "epoch": 0.28526148969889065, "grad_norm": 0.4816858922190366, "learning_rate": 5e-06, "loss": 0.6867, "step": 90 }, { "epoch": 0.31695721077654515, "grad_norm": 0.7154671815551356, "learning_rate": 5e-06, "loss": 0.6748, "step": 100 }, { "epoch": 0.3486529318541997, "grad_norm": 0.4058042716028271, "learning_rate": 5e-06, "loss": 0.6765, "step": 110 }, { "epoch": 0.3803486529318542, "grad_norm": 0.39453817862211543, "learning_rate": 5e-06, "loss": 0.6734, "step": 120 }, { "epoch": 0.4120443740095087, "grad_norm": 0.6699185012363967, "learning_rate": 5e-06, "loss": 0.6736, "step": 130 }, { "epoch": 0.4437400950871632, "grad_norm": 0.7047289653585949, "learning_rate": 5e-06, "loss": 0.6675, "step": 140 }, { "epoch": 0.4754358161648177, "grad_norm": 0.5651557636719974, "learning_rate": 5e-06, "loss": 0.67, "step": 150 }, { "epoch": 0.5071315372424723, "grad_norm": 0.7485262569132396, "learning_rate": 5e-06, "loss": 0.6702, "step": 160 }, { "epoch": 0.5388272583201268, "grad_norm": 0.5937798065160911, "learning_rate": 5e-06, "loss": 0.6684, "step": 170 }, { "epoch": 0.5705229793977813, "grad_norm": 0.42087472284785377, "learning_rate": 5e-06, "loss": 0.6696, "step": 180 }, { "epoch": 0.6022187004754358, "grad_norm": 0.6838376914191113, "learning_rate": 5e-06, "loss": 0.6736, "step": 190 }, { "epoch": 0.6339144215530903, "grad_norm": 0.6779376025706788, "learning_rate": 5e-06, "loss": 0.6568, "step": 200 }, { "epoch": 0.6656101426307448, "grad_norm": 0.5461606311887525, "learning_rate": 5e-06, "loss": 0.6574, "step": 210 }, { "epoch": 0.6973058637083994, "grad_norm": 0.4896365938443348, "learning_rate": 5e-06, "loss": 0.6661, "step": 220 }, { "epoch": 0.7290015847860539, "grad_norm": 0.5061952793250585, "learning_rate": 5e-06, "loss": 0.6568, "step": 230 }, { "epoch": 0.7606973058637084, "grad_norm": 0.5463009018381003, "learning_rate": 5e-06, "loss": 0.6542, "step": 240 }, { "epoch": 0.7923930269413629, "grad_norm": 0.4203063093835648, "learning_rate": 5e-06, "loss": 0.6528, "step": 250 }, { "epoch": 0.8240887480190174, "grad_norm": 0.5364831104473663, "learning_rate": 5e-06, "loss": 0.6596, "step": 260 }, { "epoch": 0.8557844690966719, "grad_norm": 0.5836930133680968, "learning_rate": 5e-06, "loss": 0.6588, "step": 270 }, { "epoch": 0.8874801901743264, "grad_norm": 0.4587582257779609, "learning_rate": 5e-06, "loss": 0.6569, "step": 280 }, { "epoch": 0.919175911251981, "grad_norm": 0.5211716841949762, "learning_rate": 5e-06, "loss": 0.6483, "step": 290 }, { "epoch": 0.9508716323296355, "grad_norm": 0.6487858529916467, "learning_rate": 5e-06, "loss": 0.6492, "step": 300 }, { "epoch": 0.9825673534072901, "grad_norm": 0.4204562899057241, "learning_rate": 5e-06, "loss": 0.6466, "step": 310 }, { "epoch": 0.9984152139461173, "eval_loss": 0.6489410996437073, "eval_runtime": 170.8766, "eval_samples_per_second": 49.72, "eval_steps_per_second": 0.392, "step": 315 }, { "epoch": 1.0142630744849446, "grad_norm": 0.7909658922802345, "learning_rate": 5e-06, "loss": 0.6309, "step": 320 }, { "epoch": 1.045958795562599, "grad_norm": 0.5926743835661953, "learning_rate": 5e-06, "loss": 0.6133, "step": 330 }, { "epoch": 1.0776545166402536, "grad_norm": 0.5086080623346543, "learning_rate": 5e-06, "loss": 0.6184, "step": 340 }, { "epoch": 1.109350237717908, "grad_norm": 0.5244697888828465, "learning_rate": 5e-06, "loss": 0.6175, "step": 350 }, { "epoch": 1.1410459587955626, "grad_norm": 0.5658308922949424, "learning_rate": 5e-06, "loss": 0.6194, "step": 360 }, { "epoch": 1.172741679873217, "grad_norm": 0.503634416974372, "learning_rate": 5e-06, "loss": 0.6139, "step": 370 }, { "epoch": 1.2044374009508716, "grad_norm": 0.5134026739487886, "learning_rate": 5e-06, "loss": 0.6145, "step": 380 }, { "epoch": 1.236133122028526, "grad_norm": 0.6504374930809814, "learning_rate": 5e-06, "loss": 0.6106, "step": 390 }, { "epoch": 1.2678288431061806, "grad_norm": 0.5488394685032496, "learning_rate": 5e-06, "loss": 0.6178, "step": 400 }, { "epoch": 1.299524564183835, "grad_norm": 0.5637848064827178, "learning_rate": 5e-06, "loss": 0.6139, "step": 410 }, { "epoch": 1.3312202852614896, "grad_norm": 0.4929677114414056, "learning_rate": 5e-06, "loss": 0.6115, "step": 420 }, { "epoch": 1.3629160063391441, "grad_norm": 0.5260476600034447, "learning_rate": 5e-06, "loss": 0.6097, "step": 430 }, { "epoch": 1.3946117274167986, "grad_norm": 0.44727265179318326, "learning_rate": 5e-06, "loss": 0.6092, "step": 440 }, { "epoch": 1.4263074484944531, "grad_norm": 0.601618414738924, "learning_rate": 5e-06, "loss": 0.6163, "step": 450 }, { "epoch": 1.4580031695721076, "grad_norm": 0.49038888044450907, "learning_rate": 5e-06, "loss": 0.6095, "step": 460 }, { "epoch": 1.4896988906497624, "grad_norm": 0.5097913207122167, "learning_rate": 5e-06, "loss": 0.6136, "step": 470 }, { "epoch": 1.5213946117274166, "grad_norm": 0.6535026695551385, "learning_rate": 5e-06, "loss": 0.6131, "step": 480 }, { "epoch": 1.5530903328050714, "grad_norm": 0.5478546695070423, "learning_rate": 5e-06, "loss": 0.6117, "step": 490 }, { "epoch": 1.5847860538827259, "grad_norm": 0.47440853445302006, "learning_rate": 5e-06, "loss": 0.6275, "step": 500 }, { "epoch": 1.6164817749603804, "grad_norm": 0.47242075964062596, "learning_rate": 5e-06, "loss": 0.6185, "step": 510 }, { "epoch": 1.6481774960380349, "grad_norm": 0.4932550522259784, "learning_rate": 5e-06, "loss": 0.6188, "step": 520 }, { "epoch": 1.6798732171156894, "grad_norm": 0.49155732750425185, "learning_rate": 5e-06, "loss": 0.618, "step": 530 }, { "epoch": 1.7115689381933439, "grad_norm": 0.3852759456359732, "learning_rate": 5e-06, "loss": 0.6108, "step": 540 }, { "epoch": 1.7432646592709984, "grad_norm": 0.45204407263371044, "learning_rate": 5e-06, "loss": 0.6128, "step": 550 }, { "epoch": 1.7749603803486529, "grad_norm": 0.45233482856711393, "learning_rate": 5e-06, "loss": 0.6129, "step": 560 }, { "epoch": 1.8066561014263076, "grad_norm": 0.40723601066321924, "learning_rate": 5e-06, "loss": 0.6141, "step": 570 }, { "epoch": 1.8383518225039621, "grad_norm": 0.49021896910219537, "learning_rate": 5e-06, "loss": 0.6048, "step": 580 }, { "epoch": 1.8700475435816166, "grad_norm": 0.4137149099162348, "learning_rate": 5e-06, "loss": 0.6069, "step": 590 }, { "epoch": 1.9017432646592711, "grad_norm": 0.43600015726361796, "learning_rate": 5e-06, "loss": 0.6127, "step": 600 }, { "epoch": 1.9334389857369256, "grad_norm": 0.5983745339259906, "learning_rate": 5e-06, "loss": 0.6119, "step": 610 }, { "epoch": 1.9651347068145801, "grad_norm": 0.44191449750213707, "learning_rate": 5e-06, "loss": 0.6105, "step": 620 }, { "epoch": 1.9968304278922346, "grad_norm": 0.41833944166046033, "learning_rate": 5e-06, "loss": 0.612, "step": 630 }, { "epoch": 2.0, "eval_loss": 0.6384235620498657, "eval_runtime": 172.0031, "eval_samples_per_second": 49.394, "eval_steps_per_second": 0.39, "step": 631 }, { "epoch": 2.028526148969889, "grad_norm": 0.502856771702954, "learning_rate": 5e-06, "loss": 0.5748, "step": 640 }, { "epoch": 2.0602218700475436, "grad_norm": 0.47143025065214356, "learning_rate": 5e-06, "loss": 0.573, "step": 650 }, { "epoch": 2.091917591125198, "grad_norm": 0.4378418090102208, "learning_rate": 5e-06, "loss": 0.573, "step": 660 }, { "epoch": 2.1236133122028527, "grad_norm": 0.6058133875665858, "learning_rate": 5e-06, "loss": 0.5783, "step": 670 }, { "epoch": 2.155309033280507, "grad_norm": 0.47371009100624967, "learning_rate": 5e-06, "loss": 0.5698, "step": 680 }, { "epoch": 2.1870047543581617, "grad_norm": 0.4991361347258838, "learning_rate": 5e-06, "loss": 0.5732, "step": 690 }, { "epoch": 2.218700475435816, "grad_norm": 0.5359340720470293, "learning_rate": 5e-06, "loss": 0.5699, "step": 700 }, { "epoch": 2.2503961965134707, "grad_norm": 0.4893107425446941, "learning_rate": 5e-06, "loss": 0.5751, "step": 710 }, { "epoch": 2.282091917591125, "grad_norm": 0.44259860353438574, "learning_rate": 5e-06, "loss": 0.57, "step": 720 }, { "epoch": 2.3137876386687797, "grad_norm": 0.7444991766460172, "learning_rate": 5e-06, "loss": 0.5714, "step": 730 }, { "epoch": 2.345483359746434, "grad_norm": 0.39951135529059456, "learning_rate": 5e-06, "loss": 0.577, "step": 740 }, { "epoch": 2.3771790808240887, "grad_norm": 0.4742761127635493, "learning_rate": 5e-06, "loss": 0.5782, "step": 750 }, { "epoch": 2.408874801901743, "grad_norm": 0.585898905502293, "learning_rate": 5e-06, "loss": 0.5755, "step": 760 }, { "epoch": 2.4405705229793977, "grad_norm": 0.6170117233151285, "learning_rate": 5e-06, "loss": 0.5752, "step": 770 }, { "epoch": 2.472266244057052, "grad_norm": 0.45913749051577313, "learning_rate": 5e-06, "loss": 0.5723, "step": 780 }, { "epoch": 2.5039619651347067, "grad_norm": 0.4584152935889332, "learning_rate": 5e-06, "loss": 0.5679, "step": 790 }, { "epoch": 2.535657686212361, "grad_norm": 0.42856693615532904, "learning_rate": 5e-06, "loss": 0.5744, "step": 800 }, { "epoch": 2.5673534072900157, "grad_norm": 0.5273756256360088, "learning_rate": 5e-06, "loss": 0.5794, "step": 810 }, { "epoch": 2.59904912836767, "grad_norm": 0.46899833087451076, "learning_rate": 5e-06, "loss": 0.5709, "step": 820 }, { "epoch": 2.6307448494453247, "grad_norm": 0.484076993070708, "learning_rate": 5e-06, "loss": 0.5759, "step": 830 }, { "epoch": 2.662440570522979, "grad_norm": 0.4349705497114564, "learning_rate": 5e-06, "loss": 0.5752, "step": 840 }, { "epoch": 2.6941362916006337, "grad_norm": 0.5194459175036173, "learning_rate": 5e-06, "loss": 0.5724, "step": 850 }, { "epoch": 2.7258320126782882, "grad_norm": 0.5337441754498483, "learning_rate": 5e-06, "loss": 0.574, "step": 860 }, { "epoch": 2.7575277337559427, "grad_norm": 0.5009650532937617, "learning_rate": 5e-06, "loss": 0.5772, "step": 870 }, { "epoch": 2.7892234548335972, "grad_norm": 0.46031841411878416, "learning_rate": 5e-06, "loss": 0.5817, "step": 880 }, { "epoch": 2.8209191759112517, "grad_norm": 0.5673120105732073, "learning_rate": 5e-06, "loss": 0.5808, "step": 890 }, { "epoch": 2.8526148969889062, "grad_norm": 0.614359093779175, "learning_rate": 5e-06, "loss": 0.5801, "step": 900 }, { "epoch": 2.8843106180665607, "grad_norm": 0.5019306735450663, "learning_rate": 5e-06, "loss": 0.5813, "step": 910 }, { "epoch": 2.9160063391442153, "grad_norm": 0.43396332531042875, "learning_rate": 5e-06, "loss": 0.576, "step": 920 }, { "epoch": 2.94770206022187, "grad_norm": 0.4457359128495874, "learning_rate": 5e-06, "loss": 0.5798, "step": 930 }, { "epoch": 2.9793977812995247, "grad_norm": 0.4575875895533821, "learning_rate": 5e-06, "loss": 0.5816, "step": 940 }, { "epoch": 2.995245641838352, "eval_loss": 0.637653648853302, "eval_runtime": 171.7653, "eval_samples_per_second": 49.463, "eval_steps_per_second": 0.39, "step": 945 }, { "epoch": 2.995245641838352, "step": 945, "total_flos": 1582491437629440.0, "train_loss": 0.6238129633444327, "train_runtime": 28593.492, "train_samples_per_second": 16.935, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 945, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1582491437629440.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }