{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9978431433840766, "eval_steps": 500, "global_step": 2997, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001000281329123816, "grad_norm": 62.25, "learning_rate": 6.666666666666668e-08, "loss": 2.3405, "step": 1 }, { "epoch": 0.0500140664561908, "grad_norm": 6.875, "learning_rate": 3.3333333333333333e-06, "loss": 1.3787, "step": 50 }, { "epoch": 0.1000281329123816, "grad_norm": 3.9375, "learning_rate": 6.666666666666667e-06, "loss": 0.6171, "step": 100 }, { "epoch": 0.1500421993685724, "grad_norm": 3.703125, "learning_rate": 1e-05, "loss": 0.5743, "step": 150 }, { "epoch": 0.2000562658247632, "grad_norm": 2.96875, "learning_rate": 1.3333333333333333e-05, "loss": 0.5609, "step": 200 }, { "epoch": 0.25007033228095404, "grad_norm": 3.171875, "learning_rate": 1.6666666666666667e-05, "loss": 0.6189, "step": 250 }, { "epoch": 0.3000843987371448, "grad_norm": 5.90625, "learning_rate": 2e-05, "loss": 0.5975, "step": 300 }, { "epoch": 0.35009846519333565, "grad_norm": 2.140625, "learning_rate": 1.9983043934122208e-05, "loss": 0.5851, "step": 350 }, { "epoch": 0.4001125316495264, "grad_norm": 2.671875, "learning_rate": 1.9932233238122834e-05, "loss": 0.567, "step": 400 }, { "epoch": 0.45012659810571726, "grad_norm": 2.046875, "learning_rate": 1.984774022190361e-05, "loss": 0.5503, "step": 450 }, { "epoch": 0.5001406645619081, "grad_norm": 1.734375, "learning_rate": 1.972985141929439e-05, "loss": 0.5386, "step": 500 }, { "epoch": 0.5501547310180989, "grad_norm": 1.8203125, "learning_rate": 1.9578966616355823e-05, "loss": 0.5262, "step": 550 }, { "epoch": 0.6001687974742896, "grad_norm": 1.921875, "learning_rate": 1.9395597495619634e-05, "loss": 0.5219, "step": 600 }, { "epoch": 0.6501828639304804, "grad_norm": 1.6875, "learning_rate": 1.918036590086405e-05, "loss": 0.5065, "step": 650 }, { "epoch": 0.7001969303866713, "grad_norm": 1.703125, "learning_rate": 1.8934001728309003e-05, "loss": 0.5041, "step": 700 }, { "epoch": 0.7502109968428621, "grad_norm": 1.7734375, "learning_rate": 1.865734045138245e-05, "loss": 0.4946, "step": 750 }, { "epoch": 0.8002250632990529, "grad_norm": 1.5234375, "learning_rate": 1.8351320287451865e-05, "loss": 0.4906, "step": 800 }, { "epoch": 0.8502391297552436, "grad_norm": 1.5078125, "learning_rate": 1.8016979016129164e-05, "loss": 0.4824, "step": 850 }, { "epoch": 0.9002531962114345, "grad_norm": 1.4375, "learning_rate": 1.7655450459938786e-05, "loss": 0.4736, "step": 900 }, { "epoch": 0.9502672626676253, "grad_norm": 1.5234375, "learning_rate": 1.726796063928382e-05, "loss": 0.4677, "step": 950 }, { "epoch": 1.0002813291238162, "grad_norm": 1.421875, "learning_rate": 1.6855823614749474e-05, "loss": 0.4654, "step": 1000 }, { "epoch": 1.050295395580007, "grad_norm": 1.453125, "learning_rate": 1.6420437030843482e-05, "loss": 0.3223, "step": 1050 }, { "epoch": 1.1003094620361977, "grad_norm": 1.375, "learning_rate": 1.5963277376285646e-05, "loss": 0.319, "step": 1100 }, { "epoch": 1.1503235284923885, "grad_norm": 1.2578125, "learning_rate": 1.5485894976919836e-05, "loss": 0.3218, "step": 1150 }, { "epoch": 1.2003375949485793, "grad_norm": 1.3984375, "learning_rate": 1.4989908738228567e-05, "loss": 0.3174, "step": 1200 }, { "epoch": 1.25035166140477, "grad_norm": 1.4453125, "learning_rate": 1.4477000655279376e-05, "loss": 0.3185, "step": 1250 }, { "epoch": 1.3003657278609608, "grad_norm": 1.5625, "learning_rate": 1.394891010872102e-05, "loss": 0.3153, "step": 1300 }, { "epoch": 1.3503797943171518, "grad_norm": 1.4375, "learning_rate": 1.3407427966172866e-05, "loss": 0.3161, "step": 1350 }, { "epoch": 1.4003938607733426, "grad_norm": 1.359375, "learning_rate": 1.2854390509011061e-05, "loss": 0.3117, "step": 1400 }, { "epoch": 1.4504079272295334, "grad_norm": 1.375, "learning_rate": 1.2291673205146908e-05, "loss": 0.307, "step": 1450 }, { "epoch": 1.5004219936857242, "grad_norm": 1.3046875, "learning_rate": 1.1721184348915384e-05, "loss": 0.3057, "step": 1500 }, { "epoch": 1.550436060141915, "grad_norm": 1.2890625, "learning_rate": 1.1144858589642251e-05, "loss": 0.3023, "step": 1550 }, { "epoch": 1.6004501265981057, "grad_norm": 1.34375, "learning_rate": 1.0564650370835772e-05, "loss": 0.2997, "step": 1600 }, { "epoch": 1.6504641930542965, "grad_norm": 1.3125, "learning_rate": 9.982527302252135e-06, "loss": 0.2989, "step": 1650 }, { "epoch": 1.7004782595104873, "grad_norm": 1.390625, "learning_rate": 9.40046348731131e-06, "loss": 0.2947, "step": 1700 }, { "epoch": 1.750492325966678, "grad_norm": 1.40625, "learning_rate": 8.820432828491542e-06, "loss": 0.2935, "step": 1750 }, { "epoch": 1.8005063924228688, "grad_norm": 1.359375, "learning_rate": 8.244402333405252e-06, "loss": 0.289, "step": 1800 }, { "epoch": 1.8505204588790596, "grad_norm": 1.40625, "learning_rate": 7.674325444256899e-06, "loss": 0.2874, "step": 1850 }, { "epoch": 1.9005345253352506, "grad_norm": 1.40625, "learning_rate": 7.112135413304042e-06, "loss": 0.2839, "step": 1900 }, { "epoch": 1.9505485917914414, "grad_norm": 1.2421875, "learning_rate": 6.55973874678682e-06, "loss": 0.2832, "step": 1950 }, { "epoch": 2.0005626582476324, "grad_norm": 1.15625, "learning_rate": 6.0190087395588596e-06, "loss": 0.2765, "step": 2000 }, { "epoch": 2.050576724703823, "grad_norm": 1.2578125, "learning_rate": 5.491779122345093e-06, "loss": 0.1509, "step": 2050 }, { "epoch": 2.100590791160014, "grad_norm": 1.203125, "learning_rate": 4.979837843169959e-06, "loss": 0.1491, "step": 2100 }, { "epoch": 2.1506048576162047, "grad_norm": 1.296875, "learning_rate": 4.484921004044509e-06, "loss": 0.149, "step": 2150 }, { "epoch": 2.2006189240723955, "grad_norm": 1.1875, "learning_rate": 4.008706973474391e-06, "loss": 0.1492, "step": 2200 }, { "epoch": 2.2506329905285862, "grad_norm": 1.1640625, "learning_rate": 3.5528106947544626e-06, "loss": 0.1477, "step": 2250 }, { "epoch": 2.300647056984777, "grad_norm": 1.2421875, "learning_rate": 3.118778209351808e-06, "loss": 0.1478, "step": 2300 }, { "epoch": 2.350661123440968, "grad_norm": 1.3046875, "learning_rate": 2.7080814139495402e-06, "loss": 0.1471, "step": 2350 }, { "epoch": 2.4006751898971586, "grad_norm": 1.265625, "learning_rate": 2.322113068931391e-06, "loss": 0.1472, "step": 2400 }, { "epoch": 2.4506892563533493, "grad_norm": 1.2421875, "learning_rate": 1.9621820752343324e-06, "loss": 0.1467, "step": 2450 }, { "epoch": 2.50070332280954, "grad_norm": 1.1875, "learning_rate": 1.629509035586484e-06, "loss": 0.1449, "step": 2500 }, { "epoch": 2.550717389265731, "grad_norm": 1.28125, "learning_rate": 1.3252221151830513e-06, "loss": 0.1457, "step": 2550 }, { "epoch": 2.6007314557219217, "grad_norm": 1.2890625, "learning_rate": 1.0503532158376584e-06, "loss": 0.1447, "step": 2600 }, { "epoch": 2.6507455221781124, "grad_norm": 1.28125, "learning_rate": 8.058344765833171e-07, "loss": 0.146, "step": 2650 }, { "epoch": 2.7007595886343037, "grad_norm": 1.3359375, "learning_rate": 5.924951125902545e-07, "loss": 0.1461, "step": 2700 }, { "epoch": 2.7507736550904944, "grad_norm": 1.1875, "learning_rate": 4.11058603120511e-07, "loss": 0.144, "step": 2750 }, { "epoch": 2.800787721546685, "grad_norm": 1.171875, "learning_rate": 2.6214023805552826e-07, "loss": 0.1447, "step": 2800 }, { "epoch": 2.850801788002876, "grad_norm": 1.28125, "learning_rate": 1.462450313169983e-07, "loss": 0.1443, "step": 2850 }, { "epoch": 2.9008158544590668, "grad_norm": 1.2421875, "learning_rate": 6.376600825699463e-08, "loss": 0.1443, "step": 2900 }, { "epoch": 2.9508299209152575, "grad_norm": 1.21875, "learning_rate": 1.49828728252277e-08, "loss": 0.1456, "step": 2950 }, { "epoch": 2.9978431433840766, "step": 2997, "total_flos": 1.9278929080237425e+18, "train_loss": 0.3424536967062735, "train_runtime": 31340.7737, "train_samples_per_second": 6.124, "train_steps_per_second": 0.096 } ], "logging_steps": 50, "max_steps": 2997, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.9278929080237425e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }