{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1631321370309951, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032626427406199023, "grad_norm": 0.774185299873352, "learning_rate": 5e-06, "loss": 44.318, "step": 1 }, { "epoch": 0.0032626427406199023, "eval_loss": 11.076703071594238, "eval_runtime": 1.2109, "eval_samples_per_second": 107.361, "eval_steps_per_second": 53.68, "step": 1 }, { "epoch": 0.0065252854812398045, "grad_norm": 0.8545265793800354, "learning_rate": 1e-05, "loss": 44.2985, "step": 2 }, { "epoch": 0.009787928221859706, "grad_norm": 0.8348920941352844, "learning_rate": 1.5e-05, "loss": 44.289, "step": 3 }, { "epoch": 0.013050570962479609, "grad_norm": 0.7924385070800781, "learning_rate": 2e-05, "loss": 44.308, "step": 4 }, { "epoch": 0.01631321370309951, "grad_norm": 0.8149838447570801, "learning_rate": 2.5e-05, "loss": 44.2981, "step": 5 }, { "epoch": 0.01957585644371941, "grad_norm": 0.8136796951293945, "learning_rate": 3e-05, "loss": 44.2935, "step": 6 }, { "epoch": 0.022838499184339316, "grad_norm": 0.8278170824050903, "learning_rate": 3.5e-05, "loss": 44.3029, "step": 7 }, { "epoch": 0.026101141924959218, "grad_norm": 0.8501989245414734, "learning_rate": 4e-05, "loss": 44.3039, "step": 8 }, { "epoch": 0.02936378466557912, "grad_norm": 0.8882483243942261, "learning_rate": 4.5e-05, "loss": 44.2868, "step": 9 }, { "epoch": 0.03262642740619902, "grad_norm": 0.8116857409477234, "learning_rate": 5e-05, "loss": 44.3318, "step": 10 }, { "epoch": 0.03588907014681892, "grad_norm": 0.8323069214820862, "learning_rate": 4.99229333433282e-05, "loss": 44.2835, "step": 11 }, { "epoch": 0.03915171288743882, "grad_norm": 0.8103060722351074, "learning_rate": 4.9692208514878444e-05, "loss": 44.2985, "step": 12 }, { "epoch": 0.04241435562805873, "grad_norm": 0.8131699562072754, "learning_rate": 4.9309248009941914e-05, "loss": 44.2883, "step": 13 }, { "epoch": 0.04241435562805873, "eval_loss": 11.075307846069336, "eval_runtime": 0.3243, "eval_samples_per_second": 400.855, "eval_steps_per_second": 200.428, "step": 13 }, { "epoch": 0.04567699836867863, "grad_norm": 0.8031054139137268, "learning_rate": 4.877641290737884e-05, "loss": 44.2919, "step": 14 }, { "epoch": 0.048939641109298535, "grad_norm": 0.7678930163383484, "learning_rate": 4.8096988312782174e-05, "loss": 44.3113, "step": 15 }, { "epoch": 0.052202283849918436, "grad_norm": 0.8228908777236938, "learning_rate": 4.72751631047092e-05, "loss": 44.2994, "step": 16 }, { "epoch": 0.05546492659053834, "grad_norm": 0.8377927541732788, "learning_rate": 4.6316004108852305e-05, "loss": 44.2779, "step": 17 }, { "epoch": 0.05872756933115824, "grad_norm": 0.8467262387275696, "learning_rate": 4.522542485937369e-05, "loss": 44.2685, "step": 18 }, { "epoch": 0.06199021207177814, "grad_norm": 0.8959646821022034, "learning_rate": 4.401014914000078e-05, "loss": 44.2928, "step": 19 }, { "epoch": 0.06525285481239804, "grad_norm": 0.7665279507637024, "learning_rate": 4.267766952966369e-05, "loss": 44.3061, "step": 20 }, { "epoch": 0.06851549755301795, "grad_norm": 0.8404932618141174, "learning_rate": 4.123620120825459e-05, "loss": 44.2792, "step": 21 }, { "epoch": 0.07177814029363784, "grad_norm": 0.7906339168548584, "learning_rate": 3.969463130731183e-05, "loss": 44.2955, "step": 22 }, { "epoch": 0.07504078303425775, "grad_norm": 0.8726968169212341, "learning_rate": 3.8062464117898724e-05, "loss": 44.2876, "step": 23 }, { 
"epoch": 0.07830342577487764, "grad_norm": 0.830900251865387, "learning_rate": 3.634976249348867e-05, "loss": 44.2961, "step": 24 }, { "epoch": 0.08156606851549755, "grad_norm": 0.8512407541275024, "learning_rate": 3.456708580912725e-05, "loss": 44.2854, "step": 25 }, { "epoch": 0.08482871125611746, "grad_norm": 0.8591986298561096, "learning_rate": 3.272542485937369e-05, "loss": 44.2909, "step": 26 }, { "epoch": 0.08482871125611746, "eval_loss": 11.072349548339844, "eval_runtime": 0.3215, "eval_samples_per_second": 404.336, "eval_steps_per_second": 202.168, "step": 26 }, { "epoch": 0.08809135399673736, "grad_norm": 0.7887829542160034, "learning_rate": 3.083613409639764e-05, "loss": 44.2948, "step": 27 }, { "epoch": 0.09135399673735727, "grad_norm": 0.8105373978614807, "learning_rate": 2.8910861626005776e-05, "loss": 44.2921, "step": 28 }, { "epoch": 0.09461663947797716, "grad_norm": 0.8155120015144348, "learning_rate": 2.6961477393196126e-05, "loss": 44.2903, "step": 29 }, { "epoch": 0.09787928221859707, "grad_norm": 0.8712279796600342, "learning_rate": 2.5e-05, "loss": 44.2978, "step": 30 }, { "epoch": 0.10114192495921696, "grad_norm": 0.8829441070556641, "learning_rate": 2.303852260680388e-05, "loss": 44.2782, "step": 31 }, { "epoch": 0.10440456769983687, "grad_norm": 0.8893045783042908, "learning_rate": 2.1089138373994223e-05, "loss": 44.295, "step": 32 }, { "epoch": 0.10766721044045677, "grad_norm": 0.7750392556190491, "learning_rate": 1.9163865903602374e-05, "loss": 44.2962, "step": 33 }, { "epoch": 0.11092985318107668, "grad_norm": 0.8300427794456482, "learning_rate": 1.7274575140626318e-05, "loss": 44.2627, "step": 34 }, { "epoch": 0.11419249592169657, "grad_norm": 0.8020292520523071, "learning_rate": 1.5432914190872757e-05, "loss": 44.291, "step": 35 }, { "epoch": 0.11745513866231648, "grad_norm": 0.7964890003204346, "learning_rate": 1.3650237506511331e-05, "loss": 44.2862, "step": 36 }, { "epoch": 0.12071778140293637, "grad_norm": 0.8006289601325989, "learning_rate": 1.1937535882101281e-05, "loss": 44.2753, "step": 37 }, { "epoch": 0.12398042414355628, "grad_norm": 0.8699887990951538, "learning_rate": 1.0305368692688174e-05, "loss": 44.2691, "step": 38 }, { "epoch": 0.1272430668841762, "grad_norm": 0.84561687707901, "learning_rate": 8.763798791745411e-06, "loss": 44.2795, "step": 39 }, { "epoch": 0.1272430668841762, "eval_loss": 11.0687837600708, "eval_runtime": 0.3224, "eval_samples_per_second": 403.278, "eval_steps_per_second": 201.639, "step": 39 }, { "epoch": 0.13050570962479607, "grad_norm": 0.9297838807106018, "learning_rate": 7.3223304703363135e-06, "loss": 44.2678, "step": 40 }, { "epoch": 0.13376835236541598, "grad_norm": 0.8666333556175232, "learning_rate": 5.989850859999227e-06, "loss": 44.2706, "step": 41 }, { "epoch": 0.1370309951060359, "grad_norm": 0.8372588753700256, "learning_rate": 4.7745751406263165e-06, "loss": 44.2717, "step": 42 }, { "epoch": 0.1402936378466558, "grad_norm": 0.8530192971229553, "learning_rate": 3.6839958911476957e-06, "loss": 44.2512, "step": 43 }, { "epoch": 0.14355628058727568, "grad_norm": 0.8691236972808838, "learning_rate": 2.7248368952908053e-06, "loss": 44.2779, "step": 44 }, { "epoch": 0.1468189233278956, "grad_norm": 0.8216147422790527, "learning_rate": 1.9030116872178316e-06, "loss": 44.2793, "step": 45 }, { "epoch": 0.1500815660685155, "grad_norm": 0.8256657123565674, "learning_rate": 1.2235870926211619e-06, "loss": 44.2719, "step": 46 }, { "epoch": 0.1533442088091354, "grad_norm": 0.8734961152076721, "learning_rate": 
6.907519900580861e-07, "loss": 44.2613, "step": 47 }, { "epoch": 0.1566068515497553, "grad_norm": 0.7690317034721375, "learning_rate": 3.077914851215585e-07, "loss": 44.2756, "step": 48 }, { "epoch": 0.1598694942903752, "grad_norm": 0.851801335811615, "learning_rate": 7.706665667180091e-08, "loss": 44.2664, "step": 49 }, { "epoch": 0.1631321370309951, "grad_norm": 0.8386527895927429, "learning_rate": 0.0, "loss": 44.2724, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 42958848000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }