{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.013019612744638524, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002603922548927705, "grad_norm": 0.8521247506141663, "learning_rate": 5.194805194805195e-06, "loss": 0.7412, "step": 10 }, { "epoch": 0.000520784509785541, "grad_norm": 0.6229312419891357, "learning_rate": 1.038961038961039e-05, "loss": 0.7138, "step": 20 }, { "epoch": 0.0007811767646783114, "grad_norm": 0.4566498100757599, "learning_rate": 1.5584415584415583e-05, "loss": 0.7079, "step": 30 }, { "epoch": 0.001041569019571082, "grad_norm": 0.4316692650318146, "learning_rate": 2.077922077922078e-05, "loss": 0.6988, "step": 40 }, { "epoch": 0.0013019612744638524, "grad_norm": 0.615436315536499, "learning_rate": 2.5974025974025972e-05, "loss": 0.6937, "step": 50 }, { "epoch": 0.0015623535293566228, "grad_norm": 0.48698583245277405, "learning_rate": 3.1168831168831166e-05, "loss": 0.7043, "step": 60 }, { "epoch": 0.0018227457842493933, "grad_norm": 0.3984021544456482, "learning_rate": 3.6363636363636364e-05, "loss": 0.6563, "step": 70 }, { "epoch": 0.002083138039142164, "grad_norm": 0.37576180696487427, "learning_rate": 4.155844155844156e-05, "loss": 0.6462, "step": 80 }, { "epoch": 0.0023435302940349343, "grad_norm": 0.35269680619239807, "learning_rate": 4.675324675324675e-05, "loss": 0.6656, "step": 90 }, { "epoch": 0.0026039225489277048, "grad_norm": 0.31541451811790466, "learning_rate": 5.1948051948051944e-05, "loss": 0.6547, "step": 100 }, { "epoch": 0.002864314803820475, "grad_norm": 0.3462330400943756, "learning_rate": 5.714285714285714e-05, "loss": 0.6621, "step": 110 }, { "epoch": 0.0031247070587132456, "grad_norm": 0.3465985953807831, "learning_rate": 6.233766233766233e-05, "loss": 0.6273, "step": 120 }, { "epoch": 0.003385099313606016, "grad_norm": 0.3297797441482544, "learning_rate": 6.753246753246754e-05, "loss": 0.6559, "step": 130 }, { "epoch": 0.0036454915684987865, "grad_norm": 0.3888818621635437, "learning_rate": 7.272727272727273e-05, "loss": 0.6756, "step": 140 }, { "epoch": 0.003905883823391557, "grad_norm": 0.3542368710041046, "learning_rate": 7.792207792207793e-05, "loss": 0.6506, "step": 150 }, { "epoch": 0.004166276078284328, "grad_norm": 0.37369370460510254, "learning_rate": 8.311688311688312e-05, "loss": 0.6645, "step": 160 }, { "epoch": 0.004426668333177098, "grad_norm": 0.3700549900531769, "learning_rate": 8.831168831168831e-05, "loss": 0.6727, "step": 170 }, { "epoch": 0.004687060588069869, "grad_norm": 0.32032889127731323, "learning_rate": 9.35064935064935e-05, "loss": 0.6529, "step": 180 }, { "epoch": 0.004947452842962639, "grad_norm": 0.3331650495529175, "learning_rate": 9.870129870129871e-05, "loss": 0.6627, "step": 190 }, { "epoch": 0.0052078450978554095, "grad_norm": 0.3300645351409912, "learning_rate": 0.00010389610389610389, "loss": 0.676, "step": 200 }, { "epoch": 0.0054682373527481795, "grad_norm": 0.350356787443161, "learning_rate": 0.00010909090909090909, "loss": 0.6564, "step": 210 }, { "epoch": 0.00572862960764095, "grad_norm": 0.382756769657135, "learning_rate": 0.00011428571428571428, "loss": 0.6243, "step": 220 }, { "epoch": 0.00598902186253372, "grad_norm": 0.34450188279151917, "learning_rate": 0.00011948051948051949, "loss": 0.6611, "step": 230 }, { "epoch": 0.006249414117426491, "grad_norm": 0.3705821633338928, "learning_rate": 0.00012467532467532467, "loss": 0.6384, "step": 240 }, { "epoch": 0.006509806372319262, "grad_norm": 0.36822304129600525, "learning_rate": 0.00012987012987012987, "loss": 0.6415, "step": 250 }, { "epoch": 0.006770198627212032, "grad_norm": 0.32358303666114807, "learning_rate": 0.00013506493506493507, "loss": 0.6584, "step": 260 }, { "epoch": 0.007030590882104803, "grad_norm": 0.33386844396591187, "learning_rate": 0.00014025974025974028, "loss": 0.6702, "step": 270 }, { "epoch": 0.007290983136997573, "grad_norm": 0.32447949051856995, "learning_rate": 0.00014545454545454546, "loss": 0.6519, "step": 280 }, { "epoch": 0.007551375391890344, "grad_norm": 0.3388073146343231, "learning_rate": 0.00015064935064935066, "loss": 0.6735, "step": 290 }, { "epoch": 0.007811767646783114, "grad_norm": 0.39655518531799316, "learning_rate": 0.00015584415584415587, "loss": 0.672, "step": 300 }, { "epoch": 0.008072159901675884, "grad_norm": 0.41258928179740906, "learning_rate": 0.00016103896103896104, "loss": 0.6626, "step": 310 }, { "epoch": 0.008332552156568656, "grad_norm": 0.3963010311126709, "learning_rate": 0.00016623376623376625, "loss": 0.6653, "step": 320 }, { "epoch": 0.008592944411461426, "grad_norm": 0.3641106188297272, "learning_rate": 0.00017142857142857143, "loss": 0.6389, "step": 330 }, { "epoch": 0.008853336666354196, "grad_norm": 0.38745763897895813, "learning_rate": 0.00017662337662337663, "loss": 0.6928, "step": 340 }, { "epoch": 0.009113728921246966, "grad_norm": 0.4573372006416321, "learning_rate": 0.00018181818181818183, "loss": 0.6679, "step": 350 }, { "epoch": 0.009374121176139737, "grad_norm": 0.45714282989501953, "learning_rate": 0.000187012987012987, "loss": 0.6453, "step": 360 }, { "epoch": 0.009634513431032507, "grad_norm": 0.37631818652153015, "learning_rate": 0.00019220779220779222, "loss": 0.6467, "step": 370 }, { "epoch": 0.009894905685925277, "grad_norm": 0.3658345639705658, "learning_rate": 0.00019740259740259742, "loss": 0.6631, "step": 380 }, { "epoch": 0.010155297940818049, "grad_norm": 0.3953540623188019, "learning_rate": 0.00019999996515752773, "loss": 0.6573, "step": 390 }, { "epoch": 0.010415690195710819, "grad_norm": 0.377763569355011, "learning_rate": 0.00019999968641789507, "loss": 0.6664, "step": 400 }, { "epoch": 0.010676082450603589, "grad_norm": 0.37128835916519165, "learning_rate": 0.0001999991289394067, "loss": 0.6342, "step": 410 }, { "epoch": 0.010936474705496359, "grad_norm": 0.33881694078445435, "learning_rate": 0.00019999829272361654, "loss": 0.6476, "step": 420 }, { "epoch": 0.01119686696038913, "grad_norm": 0.39774075150489807, "learning_rate": 0.00019999717777285545, "loss": 0.633, "step": 430 }, { "epoch": 0.0114572592152819, "grad_norm": 0.41350051760673523, "learning_rate": 0.00019999578409023126, "loss": 0.6541, "step": 440 }, { "epoch": 0.01171765147017467, "grad_norm": 0.47954171895980835, "learning_rate": 0.00019999411167962868, "loss": 0.6545, "step": 450 }, { "epoch": 0.01197804372506744, "grad_norm": 0.46860000491142273, "learning_rate": 0.00019999216054570942, "loss": 0.6512, "step": 460 }, { "epoch": 0.012238435979960213, "grad_norm": 0.4395809471607208, "learning_rate": 0.00019998993069391205, "loss": 0.6587, "step": 470 }, { "epoch": 0.012498828234852983, "grad_norm": 0.43222516775131226, "learning_rate": 0.00019998742213045206, "loss": 0.6292, "step": 480 }, { "epoch": 0.012759220489745753, "grad_norm": 0.39363613724708557, "learning_rate": 0.00019998463486232179, "loss": 0.6319, "step": 490 }, { "epoch": 0.013019612744638524, "grad_norm": 0.4984697699546814, "learning_rate": 0.0001999815688972905, "loss": 0.6488, "step": 500 } ], "logging_steps": 10, "max_steps": 19202, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8290205089792e+17, "train_batch_size": 3, "trial_name": null, "trial_params": null }