{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 219, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0045662100456621, "grad_norm": 1.1796875, "learning_rate": 9.090909090909091e-06, "loss": 2.2848, "step": 1 }, { "epoch": 0.0228310502283105, "grad_norm": 1.15625, "learning_rate": 4.545454545454546e-05, "loss": 2.315, "step": 5 }, { "epoch": 0.045662100456621, "grad_norm": 1.3984375, "learning_rate": 9.090909090909092e-05, "loss": 2.1254, "step": 10 }, { "epoch": 0.0684931506849315, "grad_norm": 1.0234375, "learning_rate": 0.00013636363636363637, "loss": 1.8718, "step": 15 }, { "epoch": 0.091324200913242, "grad_norm": 0.52734375, "learning_rate": 0.00018181818181818183, "loss": 1.7405, "step": 20 }, { "epoch": 0.1141552511415525, "grad_norm": 0.4765625, "learning_rate": 0.00019988558131018186, "loss": 1.5821, "step": 25 }, { "epoch": 0.136986301369863, "grad_norm": 0.423828125, "learning_rate": 0.00019918730395931649, "loss": 1.4932, "step": 30 }, { "epoch": 0.1598173515981735, "grad_norm": 0.35546875, "learning_rate": 0.00019785874696801202, "loss": 1.426, "step": 35 }, { "epoch": 0.182648401826484, "grad_norm": 0.40234375, "learning_rate": 0.00019590835257019714, "loss": 1.4794, "step": 40 }, { "epoch": 0.2054794520547945, "grad_norm": 1.59375, "learning_rate": 0.00019334851442746664, "loss": 1.4378, "step": 45 }, { "epoch": 0.228310502283105, "grad_norm": 0.337890625, "learning_rate": 0.00019019549887431877, "loss": 1.3208, "step": 50 }, { "epoch": 0.2511415525114155, "grad_norm": 0.3125, "learning_rate": 0.00018646934155473022, "loss": 1.3175, "step": 55 }, { "epoch": 0.273972602739726, "grad_norm": 0.2890625, "learning_rate": 0.00018219372010688515, "loss": 1.2908, "step": 60 }, { "epoch": 0.2968036529680365, "grad_norm": 96.0, "learning_rate": 0.00017739580370507532, "loss": 1.2896, "step": 65 }, { "epoch": 0.319634703196347, "grad_norm": 0.29296875, "learning_rate": 0.0001721060804148482, "loss": 1.2896, "step": 70 }, { "epoch": 0.3424657534246575, "grad_norm": 0.318359375, "learning_rate": 0.0001663581634584641, "loss": 1.282, "step": 75 }, { "epoch": 0.365296803652968, "grad_norm": 0.33203125, "learning_rate": 0.0001601885776217367, "loss": 1.2558, "step": 80 }, { "epoch": 0.3881278538812785, "grad_norm": 0.337890625, "learning_rate": 0.0001536365271595212, "loss": 1.2523, "step": 85 }, { "epoch": 0.410958904109589, "grad_norm": 0.310546875, "learning_rate": 0.0001467436466746814, "loss": 1.2186, "step": 90 }, { "epoch": 0.4337899543378995, "grad_norm": 0.29296875, "learning_rate": 0.0001395537365535585, "loss": 1.2299, "step": 95 }, { "epoch": 0.45662100456621, "grad_norm": 0.29296875, "learning_rate": 0.00013211248463910262, "loss": 1.2363, "step": 100 }, { "epoch": 0.4794520547945205, "grad_norm": 0.357421875, "learning_rate": 0.00012446717591027624, "loss": 1.2224, "step": 105 }, { "epoch": 0.502283105022831, "grad_norm": 0.34765625, "learning_rate": 0.00011666639201255506, "loss": 1.2078, "step": 110 }, { "epoch": 0.5251141552511416, "grad_norm": 0.34375, "learning_rate": 0.0001087597025488413, "loss": 1.2071, "step": 115 }, { "epoch": 0.547945205479452, "grad_norm": 0.322265625, "learning_rate": 0.00010079735009246167, "loss": 1.2097, "step": 120 }, { "epoch": 0.5707762557077626, "grad_norm": 0.31640625, "learning_rate": 9.282993092381625e-05, "loss": 1.2525, "step": 125 }, { "epoch": 0.593607305936073, "grad_norm": 0.298828125, "learning_rate": 
8.490807351941753e-05, "loss": 1.1924, "step": 130 }, { "epoch": 0.6164383561643836, "grad_norm": 0.375, "learning_rate": 7.708211683634112e-05, "loss": 1.2135, "step": 135 }, { "epoch": 0.639269406392694, "grad_norm": 0.39453125, "learning_rate": 6.940179043641005e-05, "loss": 1.191, "step": 140 }, { "epoch": 0.6621004566210046, "grad_norm": 0.3046875, "learning_rate": 6.191589848274368e-05, "loss": 1.2311, "step": 145 }, { "epoch": 0.684931506849315, "grad_norm": 0.30078125, "learning_rate": 5.467200961669619e-05, "loss": 1.2178, "step": 150 }, { "epoch": 0.7077625570776256, "grad_norm": 0.30859375, "learning_rate": 4.7716154685841944e-05, "loss": 1.1988, "step": 155 }, { "epoch": 0.730593607305936, "grad_norm": 0.32421875, "learning_rate": 4.109253424377772e-05, "loss": 1.2199, "step": 160 }, { "epoch": 0.7534246575342466, "grad_norm": 0.365234375, "learning_rate": 3.4843237680415156e-05, "loss": 1.1871, "step": 165 }, { "epoch": 0.776255707762557, "grad_norm": 0.30859375, "learning_rate": 2.9007975767533714e-05, "loss": 1.2023, "step": 170 }, { "epoch": 0.7990867579908676, "grad_norm": 0.349609375, "learning_rate": 2.3623828319116748e-05, "loss": 1.2127, "step": 175 }, { "epoch": 0.821917808219178, "grad_norm": 0.32421875, "learning_rate": 1.8725008569947365e-05, "loss": 1.2043, "step": 180 }, { "epoch": 0.8447488584474886, "grad_norm": 0.3046875, "learning_rate": 1.4342645769705977e-05, "loss": 1.2166, "step": 185 }, { "epoch": 0.867579908675799, "grad_norm": 0.291015625, "learning_rate": 1.0504587374062391e-05, "loss": 1.1974, "step": 190 }, { "epoch": 0.8904109589041096, "grad_norm": 0.322265625, "learning_rate": 7.235222089726279e-06, "loss": 1.1563, "step": 195 }, { "epoch": 0.91324200913242, "grad_norm": 0.298828125, "learning_rate": 4.555324897906132e-06, "loss": 1.1766, "step": 200 }, { "epoch": 0.9360730593607306, "grad_norm": 0.298828125, "learning_rate": 2.4819250409651607e-06, "loss": 1.2015, "step": 205 }, { "epoch": 0.958904109589041, "grad_norm": 0.314453125, "learning_rate": 1.0281978111449375e-06, "loss": 1.2226, "step": 210 }, { "epoch": 0.9817351598173516, "grad_norm": 0.3359375, "learning_rate": 2.0338082897886079e-07, "loss": 1.2156, "step": 215 }, { "epoch": 1.0, "eval_loss": 1.162326693534851, "eval_runtime": 431.6789, "eval_samples_per_second": 4.056, "eval_steps_per_second": 0.507, "step": 219 }, { "epoch": 1.0, "step": 219, "total_flos": 1.538974747113554e+17, "train_loss": 1.3291025118196391, "train_runtime": 2016.0456, "train_samples_per_second": 0.869, "train_steps_per_second": 0.109 } ], "logging_steps": 5, "max_steps": 219, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 1.538974747113554e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }