{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9814460437640404,
  "eval_steps": 47,
  "global_step": 374,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005324902238122972,
      "grad_norm": 1.5,
      "learning_rate": 1.4285714285714286e-06,
      "loss": 1.7903,
      "step": 1
    },
    {
      "epoch": 0.005324902238122972,
      "eval_loss": 1.775637149810791,
      "eval_runtime": 77.1748,
      "eval_samples_per_second": 15.29,
      "eval_steps_per_second": 15.29,
      "step": 1
    },
    {
      "epoch": 0.010649804476245944,
      "grad_norm": 1.484375,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 1.761,
      "step": 2
    },
    {
      "epoch": 0.015974706714368916,
      "grad_norm": 1.5546875,
      "learning_rate": 4.2857142857142855e-06,
      "loss": 1.8033,
      "step": 3
    },
    {
      "epoch": 0.02129960895249189,
      "grad_norm": 1.3671875,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 1.7335,
      "step": 4
    },
    {
      "epoch": 0.02662451119061486,
      "grad_norm": 1.484375,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 1.7532,
      "step": 5
    },
    {
      "epoch": 0.03194941342873783,
      "grad_norm": 1.3203125,
      "learning_rate": 8.571428571428571e-06,
      "loss": 1.7546,
      "step": 6
    },
    {
      "epoch": 0.037274315666860805,
      "grad_norm": 1.21875,
      "learning_rate": 1e-05,
      "loss": 1.8068,
      "step": 7
    },
    {
      "epoch": 0.04259921790498378,
      "grad_norm": 1.1796875,
      "learning_rate": 1.1428571428571429e-05,
      "loss": 1.8295,
      "step": 8
    },
    {
      "epoch": 0.04792412014310675,
      "grad_norm": 1.078125,
      "learning_rate": 1.2857142857142859e-05,
      "loss": 1.7343,
      "step": 9
    },
    {
      "epoch": 0.05324902238122972,
      "grad_norm": 1.25,
      "learning_rate": 1.4285714285714287e-05,
      "loss": 1.8318,
      "step": 10
    },
    {
      "epoch": 0.058573924619352694,
      "grad_norm": 1.1796875,
      "learning_rate": 1.5714285714285715e-05,
      "loss": 1.8064,
      "step": 11
    },
    {
      "epoch": 0.06389882685747567,
      "grad_norm": 1.1484375,
      "learning_rate": 1.7142857142857142e-05,
      "loss": 1.7317,
      "step": 12
    },
    {
      "epoch": 0.06922372909559864,
      "grad_norm": 1.2578125,
      "learning_rate": 1.8571428571428575e-05,
      "loss": 1.8281,
      "step": 13
    },
    {
      "epoch": 0.07454863133372161,
      "grad_norm": 1.1328125,
      "learning_rate": 2e-05,
      "loss": 1.6915,
      "step": 14
    },
    {
      "epoch": 0.07987353357184458,
      "grad_norm": 1.1484375,
      "learning_rate": 1.9999619230641714e-05,
      "loss": 1.7509,
      "step": 15
    },
    {
      "epoch": 0.08519843580996755,
      "grad_norm": 1.0859375,
      "learning_rate": 1.9998476951563914e-05,
      "loss": 1.7416,
      "step": 16
    },
    {
      "epoch": 0.09052333804809053,
      "grad_norm": 1.0859375,
      "learning_rate": 1.9996573249755573e-05,
      "loss": 1.7,
      "step": 17
    },
    {
      "epoch": 0.0958482402862135,
      "grad_norm": 1.1640625,
      "learning_rate": 1.999390827019096e-05,
      "loss": 1.771,
      "step": 18
    },
    {
      "epoch": 0.10117314252433647,
      "grad_norm": 1.1171875,
      "learning_rate": 1.999048221581858e-05,
      "loss": 1.7097,
      "step": 19
    },
    {
      "epoch": 0.10649804476245944,
      "grad_norm": 1.1015625,
      "learning_rate": 1.9986295347545738e-05,
      "loss": 1.7411,
      "step": 20
    },
    {
      "epoch": 0.11182294700058241,
      "grad_norm": 1.046875,
      "learning_rate": 1.998134798421867e-05,
      "loss": 1.7664,
      "step": 21
    },
    {
      "epoch": 0.11714784923870539,
      "grad_norm": 1.1171875,
      "learning_rate": 1.9975640502598243e-05,
      "loss": 1.7283,
      "step": 22
    },
    {
      "epoch": 0.12247275147682836,
      "grad_norm": 1.1015625,
      "learning_rate": 1.9969173337331283e-05,
      "loss": 1.8261,
      "step": 23
    },
    {
      "epoch": 0.12779765371495133,
      "grad_norm": 1.078125,
      "learning_rate": 1.9961946980917457e-05,
      "loss": 1.7096,
      "step": 24
    },
    {
      "epoch": 0.1331225559530743,
      "grad_norm": 1.078125,
      "learning_rate": 1.9953961983671792e-05,
      "loss": 1.8025,
      "step": 25
    },
    {
      "epoch": 0.13844745819119728,
      "grad_norm": 1.0625,
      "learning_rate": 1.9945218953682736e-05,
      "loss": 1.7506,
      "step": 26
    },
    {
      "epoch": 0.14377236042932023,
      "grad_norm": 1.0546875,
      "learning_rate": 1.9935718556765878e-05,
      "loss": 1.7492,
      "step": 27
    },
    {
      "epoch": 0.14909726266744322,
      "grad_norm": 1.078125,
      "learning_rate": 1.9925461516413224e-05,
      "loss": 1.7108,
      "step": 28
    },
    {
      "epoch": 0.15442216490556618,
      "grad_norm": 1.0546875,
      "learning_rate": 1.9914448613738107e-05,
      "loss": 1.6542,
      "step": 29
    },
    {
      "epoch": 0.15974706714368916,
      "grad_norm": 1.0234375,
      "learning_rate": 1.9902680687415704e-05,
      "loss": 1.7548,
      "step": 30
    },
    {
      "epoch": 0.16507196938181212,
      "grad_norm": 1.1171875,
      "learning_rate": 1.989015863361917e-05,
      "loss": 1.7417,
      "step": 31
    },
    {
      "epoch": 0.1703968716199351,
      "grad_norm": 1.0546875,
      "learning_rate": 1.9876883405951378e-05,
      "loss": 1.6839,
      "step": 32
    },
    {
      "epoch": 0.17572177385805807,
      "grad_norm": 1.0,
      "learning_rate": 1.9862856015372315e-05,
      "loss": 1.6547,
      "step": 33
    },
    {
      "epoch": 0.18104667609618105,
      "grad_norm": 1.0625,
      "learning_rate": 1.9848077530122083e-05,
      "loss": 1.7696,
      "step": 34
    },
    {
      "epoch": 0.186371578334304,
      "grad_norm": 1.1484375,
      "learning_rate": 1.983254907563955e-05,
      "loss": 1.6861,
      "step": 35
    },
    {
      "epoch": 0.191696480572427,
      "grad_norm": 1.0703125,
      "learning_rate": 1.9816271834476642e-05,
      "loss": 1.7726,
      "step": 36
    },
    {
      "epoch": 0.19702138281054996,
      "grad_norm": 1.0078125,
      "learning_rate": 1.9799247046208297e-05,
      "loss": 1.7046,
      "step": 37
    },
    {
      "epoch": 0.20234628504867294,
      "grad_norm": 1.15625,
      "learning_rate": 1.9781476007338058e-05,
      "loss": 1.7071,
      "step": 38
    },
    {
      "epoch": 0.2076711872867959,
      "grad_norm": 1.0859375,
      "learning_rate": 1.9762960071199334e-05,
      "loss": 1.8033,
      "step": 39
    },
    {
      "epoch": 0.21299608952491889,
      "grad_norm": 1.0859375,
      "learning_rate": 1.9743700647852356e-05,
      "loss": 1.8173,
      "step": 40
    },
    {
      "epoch": 0.21832099176304184,
      "grad_norm": 1.1171875,
      "learning_rate": 1.9723699203976768e-05,
      "loss": 1.7546,
      "step": 41
    },
    {
      "epoch": 0.22364589400116483,
      "grad_norm": 1.09375,
      "learning_rate": 1.9702957262759964e-05,
      "loss": 1.7599,
      "step": 42
    },
    {
      "epoch": 0.2289707962392878,
      "grad_norm": 1.03125,
      "learning_rate": 1.968147640378108e-05,
      "loss": 1.6765,
      "step": 43
    },
    {
      "epoch": 0.23429569847741077,
      "grad_norm": 1.046875,
      "learning_rate": 1.9659258262890683e-05,
      "loss": 1.7651,
      "step": 44
    },
    {
      "epoch": 0.23962060071553373,
      "grad_norm": 1.0546875,
      "learning_rate": 1.963630453208623e-05,
      "loss": 1.6026,
      "step": 45
    },
    {
      "epoch": 0.24494550295365672,
      "grad_norm": 1.1015625,
      "learning_rate": 1.961261695938319e-05,
      "loss": 1.7446,
      "step": 46
    },
    {
      "epoch": 0.2502704051917797,
      "grad_norm": 1.015625,
      "learning_rate": 1.958819734868193e-05,
      "loss": 1.7374,
      "step": 47
    },
    {
      "epoch": 0.2502704051917797,
      "eval_loss": 1.7246959209442139,
      "eval_runtime": 84.4189,
      "eval_samples_per_second": 13.978,
      "eval_steps_per_second": 13.978,
      "step": 47
    },
    {
      "epoch": 0.25559530742990266,
      "grad_norm": 1.0390625,
      "learning_rate": 1.9563047559630356e-05,
      "loss": 1.7224,
      "step": 48
    },
    {
      "epoch": 0.26092020966802565,
      "grad_norm": 1.0703125,
      "learning_rate": 1.953716950748227e-05,
      "loss": 1.7919,
      "step": 49
    },
    {
      "epoch": 0.2662451119061486,
      "grad_norm": 1.0625,
      "learning_rate": 1.9510565162951538e-05,
      "loss": 1.728,
      "step": 50
    },
    {
      "epoch": 0.27157001414427157,
      "grad_norm": 1.109375,
      "learning_rate": 1.9483236552061996e-05,
      "loss": 1.758,
      "step": 51
    },
    {
      "epoch": 0.27689491638239455,
      "grad_norm": 1.0078125,
      "learning_rate": 1.945518575599317e-05,
      "loss": 1.6651,
      "step": 52
    },
    {
      "epoch": 0.28221981862051754,
      "grad_norm": 1.1015625,
      "learning_rate": 1.9426414910921785e-05,
      "loss": 1.7097,
      "step": 53
    },
    {
      "epoch": 0.28754472085864047,
      "grad_norm": 1.0703125,
      "learning_rate": 1.9396926207859085e-05,
      "loss": 1.7315,
      "step": 54
    },
    {
      "epoch": 0.29286962309676345,
      "grad_norm": 1.140625,
      "learning_rate": 1.9366721892483976e-05,
      "loss": 1.7535,
      "step": 55
    },
    {
      "epoch": 0.29819452533488644,
      "grad_norm": 1.0859375,
      "learning_rate": 1.9335804264972018e-05,
      "loss": 1.7806,
      "step": 56
    },
    {
      "epoch": 0.3035194275730094,
      "grad_norm": 1.046875,
      "learning_rate": 1.9304175679820247e-05,
      "loss": 1.7565,
      "step": 57
    },
    {
      "epoch": 0.30884432981113236,
      "grad_norm": 1.0234375,
      "learning_rate": 1.9271838545667876e-05,
      "loss": 1.7056,
      "step": 58
    },
    {
      "epoch": 0.31416923204925534,
      "grad_norm": 0.9921875,
      "learning_rate": 1.9238795325112867e-05,
      "loss": 1.7266,
      "step": 59
    },
    {
      "epoch": 0.31949413428737833,
      "grad_norm": 1.046875,
      "learning_rate": 1.9205048534524405e-05,
      "loss": 1.6798,
      "step": 60
    },
    {
      "epoch": 0.3248190365255013,
      "grad_norm": 1.1328125,
      "learning_rate": 1.917060074385124e-05,
      "loss": 1.6847,
      "step": 61
    },
    {
      "epoch": 0.33014393876362425,
      "grad_norm": 1.0390625,
      "learning_rate": 1.913545457642601e-05,
      "loss": 1.6441,
      "step": 62
    },
    {
      "epoch": 0.33546884100174723,
      "grad_norm": 1.0234375,
      "learning_rate": 1.9099612708765432e-05,
      "loss": 1.6839,
      "step": 63
    },
    {
      "epoch": 0.3407937432398702,
      "grad_norm": 1.0625,
      "learning_rate": 1.9063077870366504e-05,
      "loss": 1.6653,
      "step": 64
    },
    {
      "epoch": 0.3461186454779932,
      "grad_norm": 1.0,
      "learning_rate": 1.902585284349861e-05,
      "loss": 1.6688,
      "step": 65
    },
    {
      "epoch": 0.35144354771611613,
      "grad_norm": 1.046875,
      "learning_rate": 1.8987940462991673e-05,
      "loss": 1.6317,
      "step": 66
    },
    {
      "epoch": 0.3567684499542391,
      "grad_norm": 0.94921875,
      "learning_rate": 1.894934361602025e-05,
      "loss": 1.6889,
      "step": 67
    },
    {
      "epoch": 0.3620933521923621,
      "grad_norm": 1.1015625,
      "learning_rate": 1.891006524188368e-05,
      "loss": 1.7744,
      "step": 68
    },
    {
      "epoch": 0.3674182544304851,
      "grad_norm": 1.109375,
      "learning_rate": 1.887010833178222e-05,
      "loss": 1.7525,
      "step": 69
    },
    {
      "epoch": 0.372743156668608,
      "grad_norm": 1.03125,
      "learning_rate": 1.8829475928589272e-05,
      "loss": 1.5786,
      "step": 70
    },
    {
      "epoch": 0.378068058906731,
      "grad_norm": 0.984375,
      "learning_rate": 1.8788171126619653e-05,
      "loss": 1.7715,
      "step": 71
    },
    {
      "epoch": 0.383392961144854,
      "grad_norm": 1.0078125,
      "learning_rate": 1.874619707139396e-05,
      "loss": 1.6526,
      "step": 72
    },
    {
      "epoch": 0.388717863382977,
      "grad_norm": 1.0390625,
      "learning_rate": 1.8703556959398998e-05,
      "loss": 1.7511,
      "step": 73
    },
    {
      "epoch": 0.3940427656210999,
      "grad_norm": 1.046875,
      "learning_rate": 1.866025403784439e-05,
      "loss": 1.789,
      "step": 74
    },
    {
      "epoch": 0.3993676678592229,
      "grad_norm": 1.078125,
      "learning_rate": 1.861629160441526e-05,
      "loss": 1.7705,
      "step": 75
    },
    {
      "epoch": 0.4046925700973459,
      "grad_norm": 1.0625,
      "learning_rate": 1.8571673007021124e-05,
      "loss": 1.7042,
      "step": 76
    },
    {
      "epoch": 0.4100174723354688,
      "grad_norm": 0.98828125,
      "learning_rate": 1.8526401643540924e-05,
      "loss": 1.6971,
      "step": 77
    },
    {
      "epoch": 0.4153423745735918,
      "grad_norm": 0.9765625,
      "learning_rate": 1.848048096156426e-05,
      "loss": 1.6873,
      "step": 78
    },
    {
      "epoch": 0.4206672768117148,
      "grad_norm": 0.953125,
      "learning_rate": 1.843391445812886e-05,
      "loss": 1.6937,
      "step": 79
    },
    {
      "epoch": 0.42599217904983777,
      "grad_norm": 1.0234375,
      "learning_rate": 1.8386705679454243e-05,
      "loss": 1.6798,
      "step": 80
    },
    {
      "epoch": 0.4313170812879607,
      "grad_norm": 1.0703125,
      "learning_rate": 1.8338858220671683e-05,
      "loss": 1.7316,
      "step": 81
    },
    {
      "epoch": 0.4366419835260837,
      "grad_norm": 1.0390625,
      "learning_rate": 1.8290375725550417e-05,
      "loss": 1.6617,
      "step": 82
    },
    {
      "epoch": 0.4419668857642067,
      "grad_norm": 1.078125,
      "learning_rate": 1.8241261886220155e-05,
      "loss": 1.7817,
      "step": 83
    },
    {
      "epoch": 0.44729178800232966,
      "grad_norm": 1.046875,
      "learning_rate": 1.819152044288992e-05,
      "loss": 1.7649,
      "step": 84
    },
    {
      "epoch": 0.4526166902404526,
      "grad_norm": 1.0546875,
      "learning_rate": 1.8141155183563195e-05,
      "loss": 1.7158,
      "step": 85
    },
    {
      "epoch": 0.4579415924785756,
      "grad_norm": 1.1171875,
      "learning_rate": 1.8090169943749477e-05,
      "loss": 1.7345,
      "step": 86
    },
    {
      "epoch": 0.46326649471669856,
      "grad_norm": 1.0234375,
      "learning_rate": 1.8038568606172172e-05,
      "loss": 1.6563,
      "step": 87
    },
    {
      "epoch": 0.46859139695482155,
      "grad_norm": 1.0234375,
      "learning_rate": 1.798635510047293e-05,
      "loss": 1.726,
      "step": 88
    },
    {
      "epoch": 0.4739162991929445,
      "grad_norm": 0.9921875,
      "learning_rate": 1.7933533402912354e-05,
      "loss": 1.6767,
      "step": 89
    },
    {
      "epoch": 0.47924120143106747,
      "grad_norm": 1.0234375,
      "learning_rate": 1.788010753606722e-05,
      "loss": 1.6585,
      "step": 90
    },
    {
      "epoch": 0.48456610366919045,
      "grad_norm": 0.9609375,
      "learning_rate": 1.782608156852414e-05,
      "loss": 1.6202,
      "step": 91
    },
    {
      "epoch": 0.48989100590731344,
      "grad_norm": 1.0234375,
      "learning_rate": 1.777145961456971e-05,
      "loss": 1.7857,
      "step": 92
    },
    {
      "epoch": 0.49521590814543637,
      "grad_norm": 1.0546875,
      "learning_rate": 1.7716245833877202e-05,
      "loss": 1.7479,
      "step": 93
    },
    {
      "epoch": 0.5005408103835594,
      "grad_norm": 1.0625,
      "learning_rate": 1.766044443118978e-05,
      "loss": 1.7516,
      "step": 94
    },
    {
      "epoch": 0.5005408103835594,
      "eval_loss": 1.7072687149047852,
      "eval_runtime": 77.7379,
      "eval_samples_per_second": 15.179,
      "eval_steps_per_second": 15.179,
      "step": 94
    },
    {
      "epoch": 0.5058657126216823,
      "grad_norm": 1.0703125,
      "learning_rate": 1.7604059656000313e-05,
      "loss": 1.6983,
      "step": 95
    },
    {
      "epoch": 0.5111906148598053,
      "grad_norm": 1.078125,
      "learning_rate": 1.7547095802227723e-05,
      "loss": 1.7144,
      "step": 96
    },
    {
      "epoch": 0.5165155170979283,
      "grad_norm": 1.0546875,
      "learning_rate": 1.7489557207890025e-05,
      "loss": 1.754,
      "step": 97
    },
    {
      "epoch": 0.5218404193360513,
      "grad_norm": 1.03125,
      "learning_rate": 1.7431448254773943e-05,
      "loss": 1.7862,
      "step": 98
    },
    {
      "epoch": 0.5271653215741742,
      "grad_norm": 1.046875,
      "learning_rate": 1.737277336810124e-05,
      "loss": 1.7244,
      "step": 99
    },
    {
      "epoch": 0.5324902238122972,
      "grad_norm": 1.0546875,
      "learning_rate": 1.7313537016191706e-05,
      "loss": 1.7129,
      "step": 100
    },
    {
      "epoch": 0.5378151260504201,
      "grad_norm": 1.0390625,
      "learning_rate": 1.7253743710122877e-05,
      "loss": 1.6302,
      "step": 101
    },
    {
      "epoch": 0.5431400282885431,
      "grad_norm": 0.99609375,
      "learning_rate": 1.7193398003386514e-05,
      "loss": 1.6831,
      "step": 102
    },
    {
      "epoch": 0.5484649305266661,
      "grad_norm": 1.0,
      "learning_rate": 1.713250449154182e-05,
      "loss": 1.74,
      "step": 103
    },
    {
      "epoch": 0.5537898327647891,
      "grad_norm": 1.0,
      "learning_rate": 1.7071067811865477e-05,
      "loss": 1.7333,
      "step": 104
    },
    {
      "epoch": 0.5591147350029121,
      "grad_norm": 1.0546875,
      "learning_rate": 1.700909264299851e-05,
      "loss": 1.7359,
      "step": 105
    },
    {
      "epoch": 0.5644396372410351,
      "grad_norm": 0.98828125,
      "learning_rate": 1.6946583704589973e-05,
      "loss": 1.7432,
      "step": 106
    },
    {
      "epoch": 0.569764539479158,
      "grad_norm": 1.0078125,
      "learning_rate": 1.688354575693754e-05,
      "loss": 1.6974,
      "step": 107
    },
    {
      "epoch": 0.5750894417172809,
      "grad_norm": 1.03125,
      "learning_rate": 1.6819983600624986e-05,
      "loss": 1.7011,
      "step": 108
    },
    {
      "epoch": 0.5804143439554039,
      "grad_norm": 1.0703125,
      "learning_rate": 1.6755902076156606e-05,
      "loss": 1.665,
      "step": 109
    },
    {
      "epoch": 0.5857392461935269,
      "grad_norm": 1.125,
      "learning_rate": 1.6691306063588583e-05,
      "loss": 1.8047,
      "step": 110
    },
    {
      "epoch": 0.5910641484316499,
      "grad_norm": 0.9921875,
      "learning_rate": 1.6626200482157378e-05,
      "loss": 1.6664,
      "step": 111
    },
    {
      "epoch": 0.5963890506697729,
      "grad_norm": 1.0234375,
      "learning_rate": 1.6560590289905074e-05,
      "loss": 1.7291,
      "step": 112
    },
    {
      "epoch": 0.6017139529078959,
      "grad_norm": 1.0859375,
      "learning_rate": 1.6494480483301836e-05,
      "loss": 1.709,
      "step": 113
    },
    {
      "epoch": 0.6070388551460189,
      "grad_norm": 1.0234375,
      "learning_rate": 1.6427876096865394e-05,
      "loss": 1.7286,
      "step": 114
    },
    {
      "epoch": 0.6123637573841417,
      "grad_norm": 1.0078125,
      "learning_rate": 1.636078220277764e-05,
      "loss": 1.6269,
      "step": 115
    },
    {
      "epoch": 0.6176886596222647,
      "grad_norm": 1.0234375,
      "learning_rate": 1.6293203910498375e-05,
      "loss": 1.6286,
      "step": 116
    },
    {
      "epoch": 0.6230135618603877,
      "grad_norm": 1.09375,
      "learning_rate": 1.6225146366376198e-05,
      "loss": 1.78,
      "step": 117
    },
    {
      "epoch": 0.6283384640985107,
      "grad_norm": 1.078125,
      "learning_rate": 1.6156614753256583e-05,
      "loss": 1.7336,
      "step": 118
    },
    {
      "epoch": 0.6336633663366337,
      "grad_norm": 1.0625,
      "learning_rate": 1.608761429008721e-05,
      "loss": 1.6578,
      "step": 119
    },
    {
      "epoch": 0.6389882685747567,
      "grad_norm": 1.015625,
      "learning_rate": 1.6018150231520486e-05,
      "loss": 1.6736,
      "step": 120
    },
    {
      "epoch": 0.6443131708128796,
      "grad_norm": 1.0546875,
      "learning_rate": 1.5948227867513416e-05,
      "loss": 1.5976,
      "step": 121
    },
    {
      "epoch": 0.6496380730510026,
      "grad_norm": 1.0078125,
      "learning_rate": 1.5877852522924733e-05,
      "loss": 1.7155,
      "step": 122
    },
    {
      "epoch": 0.6549629752891255,
      "grad_norm": 1.0078125,
      "learning_rate": 1.5807029557109398e-05,
      "loss": 1.6595,
      "step": 123
    },
    {
      "epoch": 0.6602878775272485,
      "grad_norm": 1.0234375,
      "learning_rate": 1.573576436351046e-05,
      "loss": 1.7354,
      "step": 124
    },
    {
      "epoch": 0.6656127797653715,
      "grad_norm": 1.0703125,
      "learning_rate": 1.566406236924833e-05,
      "loss": 1.7401,
      "step": 125
    },
    {
      "epoch": 0.6709376820034945,
      "grad_norm": 1.0625,
      "learning_rate": 1.5591929034707468e-05,
      "loss": 1.6774,
      "step": 126
    },
    {
      "epoch": 0.6762625842416174,
      "grad_norm": 1.0078125,
      "learning_rate": 1.5519369853120584e-05,
      "loss": 1.6818,
      "step": 127
    },
    {
      "epoch": 0.6815874864797404,
      "grad_norm": 1.0390625,
      "learning_rate": 1.5446390350150272e-05,
      "loss": 1.598,
      "step": 128
    },
    {
      "epoch": 0.6869123887178634,
      "grad_norm": 1.015625,
      "learning_rate": 1.5372996083468242e-05,
      "loss": 1.7103,
      "step": 129
    },
    {
      "epoch": 0.6922372909559864,
      "grad_norm": 1.015625,
      "learning_rate": 1.529919264233205e-05,
      "loss": 1.6196,
      "step": 130
    },
    {
      "epoch": 0.6975621931941093,
      "grad_norm": 1.046875,
      "learning_rate": 1.5224985647159489e-05,
      "loss": 1.618,
      "step": 131
    },
    {
      "epoch": 0.7028870954322323,
      "grad_norm": 1.03125,
      "learning_rate": 1.5150380749100545e-05,
      "loss": 1.7159,
      "step": 132
    },
    {
      "epoch": 0.7082119976703553,
      "grad_norm": 0.9921875,
      "learning_rate": 1.5075383629607043e-05,
      "loss": 1.6372,
      "step": 133
    },
    {
      "epoch": 0.7135368999084782,
      "grad_norm": 1.0625,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.7368,
      "step": 134
    },
    {
      "epoch": 0.7188618021466012,
      "grad_norm": 0.97265625,
      "learning_rate": 1.4924235601034673e-05,
      "loss": 1.6675,
      "step": 135
    },
    {
      "epoch": 0.7241867043847242,
      "grad_norm": 1.078125,
      "learning_rate": 1.4848096202463373e-05,
      "loss": 1.6651,
      "step": 136
    },
    {
      "epoch": 0.7295116066228472,
      "grad_norm": 1.0078125,
      "learning_rate": 1.4771587602596085e-05,
      "loss": 1.6842,
      "step": 137
    },
    {
      "epoch": 0.7348365088609702,
      "grad_norm": 1.015625,
      "learning_rate": 1.469471562785891e-05,
      "loss": 1.682,
      "step": 138
    },
    {
      "epoch": 0.7401614110990931,
      "grad_norm": 1.03125,
      "learning_rate": 1.4617486132350343e-05,
      "loss": 1.697,
      "step": 139
    },
    {
      "epoch": 0.745486313337216,
      "grad_norm": 1.03125,
      "learning_rate": 1.4539904997395468e-05,
      "loss": 1.6511,
      "step": 140
    },
    {
      "epoch": 0.750811215575339,
      "grad_norm": 1.0625,
      "learning_rate": 1.4461978131098089e-05,
      "loss": 1.6586,
      "step": 141
    },
    {
      "epoch": 0.750811215575339,
      "eval_loss": 1.6974835395812988,
      "eval_runtime": 77.2891,
      "eval_samples_per_second": 15.267,
      "eval_steps_per_second": 15.267,
      "step": 141
    },
    {
      "epoch": 0.756136117813462,
      "grad_norm": 1.0625,
      "learning_rate": 1.4383711467890776e-05,
      "loss": 1.7338,
      "step": 142
    },
    {
      "epoch": 0.761461020051585,
      "grad_norm": 1.0234375,
      "learning_rate": 1.4305110968082953e-05,
      "loss": 1.6623,
      "step": 143
    },
    {
      "epoch": 0.766785922289708,
      "grad_norm": 1.03125,
      "learning_rate": 1.4226182617406996e-05,
      "loss": 1.6748,
      "step": 144
    },
    {
      "epoch": 0.772110824527831,
      "grad_norm": 1.0,
      "learning_rate": 1.4146932426562391e-05,
      "loss": 1.7057,
      "step": 145
    },
    {
      "epoch": 0.777435726765954,
      "grad_norm": 1.0234375,
      "learning_rate": 1.4067366430758004e-05,
      "loss": 1.6403,
      "step": 146
    },
    {
      "epoch": 0.7827606290040768,
      "grad_norm": 1.09375,
      "learning_rate": 1.3987490689252463e-05,
      "loss": 1.7262,
      "step": 147
    },
    {
      "epoch": 0.7880855312421998,
      "grad_norm": 1.0703125,
      "learning_rate": 1.3907311284892737e-05,
      "loss": 1.7561,
      "step": 148
    },
    {
      "epoch": 0.7934104334803228,
      "grad_norm": 1.109375,
      "learning_rate": 1.3826834323650899e-05,
      "loss": 1.6847,
      "step": 149
    },
    {
      "epoch": 0.7987353357184458,
      "grad_norm": 1.015625,
      "learning_rate": 1.3746065934159123e-05,
      "loss": 1.5979,
      "step": 150
    },
    {
      "epoch": 0.8040602379565688,
      "grad_norm": 1.0859375,
      "learning_rate": 1.3665012267242974e-05,
      "loss": 1.6537,
      "step": 151
    },
    {
      "epoch": 0.8093851401946918,
      "grad_norm": 1.03125,
      "learning_rate": 1.3583679495453e-05,
      "loss": 1.7491,
      "step": 152
    },
    {
      "epoch": 0.8147100424328148,
      "grad_norm": 1.0,
      "learning_rate": 1.3502073812594677e-05,
      "loss": 1.6909,
      "step": 153
    },
    {
      "epoch": 0.8200349446709376,
      "grad_norm": 1.0390625,
      "learning_rate": 1.342020143325669e-05,
      "loss": 1.6917,
      "step": 154
    },
    {
      "epoch": 0.8253598469090606,
      "grad_norm": 0.99609375,
      "learning_rate": 1.333806859233771e-05,
      "loss": 1.692,
      "step": 155
    },
    {
      "epoch": 0.8306847491471836,
      "grad_norm": 1.0390625,
      "learning_rate": 1.3255681544571568e-05,
      "loss": 1.6995,
      "step": 156
    },
    {
      "epoch": 0.8360096513853066,
      "grad_norm": 1.03125,
      "learning_rate": 1.3173046564050923e-05,
      "loss": 1.6612,
      "step": 157
    },
    {
      "epoch": 0.8413345536234296,
      "grad_norm": 1.09375,
      "learning_rate": 1.3090169943749475e-05,
      "loss": 1.7376,
      "step": 158
    },
    {
      "epoch": 0.8466594558615526,
      "grad_norm": 1.0,
      "learning_rate": 1.300705799504273e-05,
      "loss": 1.6703,
      "step": 159
    },
    {
      "epoch": 0.8519843580996755,
      "grad_norm": 1.0546875,
      "learning_rate": 1.2923717047227368e-05,
      "loss": 1.6611,
      "step": 160
    },
    {
      "epoch": 0.8573092603377985,
      "grad_norm": 0.95703125,
      "learning_rate": 1.284015344703923e-05,
      "loss": 1.6215,
      "step": 161
    },
    {
      "epoch": 0.8626341625759214,
      "grad_norm": 1.046875,
      "learning_rate": 1.2756373558169992e-05,
      "loss": 1.7199,
      "step": 162
    },
    {
      "epoch": 0.8679590648140444,
      "grad_norm": 1.0078125,
      "learning_rate": 1.267238376078257e-05,
      "loss": 1.6418,
      "step": 163
    },
    {
      "epoch": 0.8732839670521674,
      "grad_norm": 0.9140625,
      "learning_rate": 1.2588190451025209e-05,
      "loss": 1.6476,
      "step": 164
    },
    {
      "epoch": 0.8786088692902904,
      "grad_norm": 0.96484375,
      "learning_rate": 1.2503800040544417e-05,
      "loss": 1.6088,
      "step": 165
    },
    {
      "epoch": 0.8839337715284133,
      "grad_norm": 0.9921875,
      "learning_rate": 1.2419218955996677e-05,
      "loss": 1.6919,
      "step": 166
    },
    {
      "epoch": 0.8892586737665363,
      "grad_norm": 1.0625,
      "learning_rate": 1.2334453638559057e-05,
      "loss": 1.6118,
      "step": 167
    },
    {
      "epoch": 0.8945835760046593,
      "grad_norm": 1.0546875,
      "learning_rate": 1.2249510543438652e-05,
      "loss": 1.7246,
      "step": 168
    },
    {
      "epoch": 0.8999084782427823,
      "grad_norm": 1.0390625,
      "learning_rate": 1.2164396139381029e-05,
      "loss": 1.6821,
      "step": 169
    },
    {
      "epoch": 0.9052333804809052,
      "grad_norm": 1.0625,
      "learning_rate": 1.2079116908177592e-05,
      "loss": 1.6474,
      "step": 170
    },
    {
      "epoch": 0.9105582827190282,
      "grad_norm": 1.1015625,
      "learning_rate": 1.1993679344171973e-05,
      "loss": 1.7538,
      "step": 171
    },
    {
      "epoch": 0.9158831849571512,
      "grad_norm": 1.0078125,
      "learning_rate": 1.190808995376545e-05,
      "loss": 1.6299,
      "step": 172
    },
    {
      "epoch": 0.9212080871952741,
      "grad_norm": 1.0078125,
      "learning_rate": 1.1822355254921478e-05,
      "loss": 1.6671,
      "step": 173
    },
    {
      "epoch": 0.9265329894333971,
      "grad_norm": 1.0078125,
      "learning_rate": 1.1736481776669307e-05,
      "loss": 1.6996,
      "step": 174
    },
    {
      "epoch": 0.9318578916715201,
      "grad_norm": 1.0546875,
      "learning_rate": 1.1650476058606776e-05,
      "loss": 1.6863,
      "step": 175
    },
    {
      "epoch": 0.9371827939096431,
      "grad_norm": 1.1171875,
      "learning_rate": 1.156434465040231e-05,
      "loss": 1.7013,
      "step": 176
    },
    {
      "epoch": 0.9425076961477661,
      "grad_norm": 0.9921875,
      "learning_rate": 1.1478094111296109e-05,
      "loss": 1.7416,
      "step": 177
    },
    {
      "epoch": 0.947832598385889,
      "grad_norm": 1.03125,
      "learning_rate": 1.1391731009600655e-05,
      "loss": 1.7964,
      "step": 178
    },
    {
      "epoch": 0.9531575006240119,
      "grad_norm": 1.046875,
      "learning_rate": 1.130526192220052e-05,
      "loss": 1.761,
      "step": 179
    },
    {
      "epoch": 0.9584824028621349,
      "grad_norm": 1.0234375,
      "learning_rate": 1.1218693434051475e-05,
      "loss": 1.6999,
      "step": 180
    },
    {
      "epoch": 0.9638073051002579,
      "grad_norm": 1.0546875,
      "learning_rate": 1.113203213767907e-05,
      "loss": 1.69,
      "step": 181
    },
    {
      "epoch": 0.9691322073383809,
      "grad_norm": 0.953125,
      "learning_rate": 1.1045284632676535e-05,
      "loss": 1.6777,
      "step": 182
    },
    {
      "epoch": 0.9744571095765039,
      "grad_norm": 1.0234375,
      "learning_rate": 1.0958457525202241e-05,
      "loss": 1.6792,
      "step": 183
    },
    {
      "epoch": 0.9797820118146269,
      "grad_norm": 1.03125,
      "learning_rate": 1.0871557427476585e-05,
      "loss": 1.6724,
      "step": 184
    },
    {
      "epoch": 0.9851069140527499,
      "grad_norm": 1.0625,
      "learning_rate": 1.0784590957278452e-05,
      "loss": 1.7415,
      "step": 185
    },
    {
      "epoch": 0.9904318162908727,
      "grad_norm": 1.0,
      "learning_rate": 1.0697564737441254e-05,
      "loss": 1.6796,
      "step": 186
    },
    {
      "epoch": 0.9957567185289957,
      "grad_norm": 1.015625,
      "learning_rate": 1.0610485395348571e-05,
      "loss": 1.6186,
      "step": 187
    },
    {
      "epoch": 1.0010816207671187,
      "grad_norm": 0.984375,
      "learning_rate": 1.0523359562429441e-05,
      "loss": 1.6741,
      "step": 188
    },
    {
      "epoch": 1.0010816207671187,
      "eval_loss": 1.6922072172164917,
      "eval_runtime": 77.3424,
      "eval_samples_per_second": 15.257,
      "eval_steps_per_second": 15.257,
      "step": 188
    },
    {
      "epoch": 1.0064065230052417,
      "grad_norm": 0.9921875,
      "learning_rate": 1.0436193873653362e-05,
      "loss": 1.7202,
      "step": 189
    },
    {
      "epoch": 1.0016640319494134,
      "grad_norm": 0.9921875,
      "learning_rate": 1.0348994967025012e-05,
      "loss": 1.639,
      "step": 190
    },
    {
      "epoch": 1.0069889341875364,
      "grad_norm": 1.0546875,
      "learning_rate": 1.0261769483078734e-05,
      "loss": 1.6223,
      "step": 191
    },
    {
      "epoch": 1.0123138364256594,
      "grad_norm": 1.0078125,
      "learning_rate": 1.0174524064372837e-05,
      "loss": 1.7193,
      "step": 192
    },
    {
      "epoch": 1.0176387386637824,
      "grad_norm": 1.0078125,
      "learning_rate": 1.008726535498374e-05,
      "loss": 1.6904,
      "step": 193
    },
    {
      "epoch": 1.0229636409019054,
      "grad_norm": 1.03125,
      "learning_rate": 1e-05,
      "loss": 1.6921,
      "step": 194
    },
    {
      "epoch": 1.0282885431400284,
      "grad_norm": 0.9921875,
      "learning_rate": 9.912734645016262e-06,
      "loss": 1.6593,
      "step": 195
    },
    {
      "epoch": 1.0336134453781514,
      "grad_norm": 0.97265625,
      "learning_rate": 9.825475935627165e-06,
      "loss": 1.6469,
      "step": 196
    },
    {
      "epoch": 1.0389383476162741,
      "grad_norm": 0.9921875,
      "learning_rate": 9.738230516921272e-06,
      "loss": 1.5877,
      "step": 197
    },
    {
      "epoch": 1.0442632498543971,
      "grad_norm": 1.03125,
      "learning_rate": 9.651005032974994e-06,
      "loss": 1.6726,
      "step": 198
    },
    {
      "epoch": 1.04958815209252,
      "grad_norm": 1.0,
      "learning_rate": 9.563806126346643e-06,
      "loss": 1.6607,
      "step": 199
    },
    {
      "epoch": 1.054913054330643,
      "grad_norm": 1.015625,
      "learning_rate": 9.476640437570562e-06,
      "loss": 1.6926,
      "step": 200
    },
    {
      "epoch": 1.060237956568766,
      "grad_norm": 1.015625,
      "learning_rate": 9.38951460465143e-06,
      "loss": 1.6241,
      "step": 201
    },
    {
      "epoch": 1.065562858806889,
      "grad_norm": 0.9921875,
      "learning_rate": 9.302435262558748e-06,
      "loss": 1.6431,
      "step": 202
    },
    {
      "epoch": 1.070887761045012,
      "grad_norm": 1.0234375,
      "learning_rate": 9.215409042721553e-06,
      "loss": 1.6339,
      "step": 203
    },
    {
      "epoch": 1.076212663283135,
      "grad_norm": 1.0,
      "learning_rate": 9.128442572523418e-06,
      "loss": 1.6701,
      "step": 204
    },
    {
      "epoch": 1.081537565521258,
      "grad_norm": 0.98828125,
      "learning_rate": 9.04154247479776e-06,
      "loss": 1.6059,
      "step": 205
    },
    {
      "epoch": 1.086862467759381,
      "grad_norm": 1.0234375,
      "learning_rate": 8.954715367323468e-06,
      "loss": 1.6638,
      "step": 206
    },
    {
      "epoch": 1.092187369997504,
      "grad_norm": 1.09375,
      "learning_rate": 8.867967862320935e-06,
      "loss": 1.6428,
      "step": 207
    },
    {
      "epoch": 1.097512272235627,
      "grad_norm": 0.9765625,
      "learning_rate": 8.781306565948528e-06,
      "loss": 1.6424,
      "step": 208
    },
    {
      "epoch": 1.10283717447375,
      "grad_norm": 0.984375,
      "learning_rate": 8.694738077799487e-06,
      "loss": 1.6105,
      "step": 209
    },
    {
      "epoch": 1.108162076711873,
      "grad_norm": 1.0,
      "learning_rate": 8.60826899039935e-06,
      "loss": 1.7068,
      "step": 210
    },
    {
      "epoch": 1.113486978949996,
      "grad_norm": 1.0859375,
      "learning_rate": 8.521905888703894e-06,
      "loss": 1.7118,
      "step": 211
    },
    {
      "epoch": 1.118811881188119,
      "grad_norm": 1.0625,
      "learning_rate": 8.43565534959769e-06,
      "loss": 1.6553,
      "step": 212
    },
    {
      "epoch": 1.1241367834262417,
      "grad_norm": 1.0703125,
      "learning_rate": 8.349523941393224e-06,
      "loss": 1.6909,
      "step": 213
    },
    {
      "epoch": 1.1294616856643647,
      "grad_norm": 1.0546875,
      "learning_rate": 8.263518223330698e-06,
      "loss": 1.7384,
      "step": 214
    },
    {
      "epoch": 1.1347865879024877,
      "grad_norm": 1.03125,
      "learning_rate": 8.177644745078525e-06,
      "loss": 1.6182,
      "step": 215
    },
    {
      "epoch": 1.1401114901406106,
      "grad_norm": 1.125,
      "learning_rate": 8.091910046234552e-06,
      "loss": 1.7063,
      "step": 216
    },
    {
      "epoch": 1.1454363923787336,
      "grad_norm": 0.96484375,
      "learning_rate": 8.00632065582803e-06,
      "loss": 1.6621,
      "step": 217
    },
    {
      "epoch": 1.1507612946168566,
      "grad_norm": 0.98828125,
      "learning_rate": 7.92088309182241e-06,
      "loss": 1.7231,
      "step": 218
    },
    {
      "epoch": 1.1560861968549796,
      "grad_norm": 1.015625,
      "learning_rate": 7.835603860618973e-06,
      "loss": 1.6707,
      "step": 219
    },
    {
      "epoch": 1.1614110990931026,
      "grad_norm": 1.015625,
      "learning_rate": 7.750489456561351e-06,
      "loss": 1.6226,
      "step": 220
    },
    {
      "epoch": 1.1667360013312256,
      "grad_norm": 1.0703125,
      "learning_rate": 7.66554636144095e-06,
      "loss": 1.6862,
      "step": 221
    },
    {
      "epoch": 1.1720609035693486,
      "grad_norm": 1.0625,
      "learning_rate": 7.580781044003324e-06,
      "loss": 1.7325,
      "step": 222
    },
    {
      "epoch": 1.1773858058074715,
      "grad_norm": 1.03125,
      "learning_rate": 7.496199959455584e-06,
      "loss": 1.6573,
      "step": 223
    },
    {
      "epoch": 1.1827107080455945,
      "grad_norm": 1.03125,
      "learning_rate": 7.411809548974792e-06,
      "loss": 1.6594,
      "step": 224
    },
    {
      "epoch": 1.1880356102837175,
      "grad_norm": 1.015625,
      "learning_rate": 7.327616239217432e-06,
      "loss": 1.6523,
      "step": 225
    },
    {
      "epoch": 1.1933605125218405,
      "grad_norm": 1.0546875,
      "learning_rate": 7.243626441830009e-06,
      "loss": 1.6895,
      "step": 226
    },
    {
      "epoch": 1.1986854147599635,
      "grad_norm": 0.99609375,
      "learning_rate": 7.159846552960774e-06,
      "loss": 1.6883,
      "step": 227
    },
    {
      "epoch": 1.2040103169980862,
      "grad_norm": 0.9921875,
      "learning_rate": 7.076282952772634e-06,
      "loss": 1.5706,
      "step": 228
    },
    {
      "epoch": 1.2093352192362095,
      "grad_norm": 0.95703125,
      "learning_rate": 6.992942004957271e-06,
      "loss": 1.5938,
      "step": 229
    },
    {
      "epoch": 1.2146601214743322,
      "grad_norm": 1.046875,
      "learning_rate": 6.909830056250527e-06,
      "loss": 1.6963,
      "step": 230
    },
    {
      "epoch": 1.2199850237124552,
      "grad_norm": 1.0703125,
      "learning_rate": 6.826953435949081e-06,
      "loss": 1.6565,
      "step": 231
    },
    {
      "epoch": 1.2253099259505782,
      "grad_norm": 1.0078125,
      "learning_rate": 6.744318455428436e-06,
      "loss": 1.657,
      "step": 232
    },
    {
      "epoch": 1.2306348281887012,
      "grad_norm": 1.09375,
      "learning_rate": 6.661931407662292e-06,
      "loss": 1.6942,
      "step": 233
    },
    {
      "epoch": 1.2359597304268242,
      "grad_norm": 0.94921875,
      "learning_rate": 6.579798566743314e-06,
      "loss": 1.6247,
      "step": 234
    },
    {
      "epoch": 1.2412846326649472,
      "grad_norm": 1.0078125,
      "learning_rate": 6.497926187405326e-06,
      "loss": 1.6845,
      "step": 235
    },
    {
      "epoch": 1.2412846326649472,
      "eval_loss": 1.6904007196426392,
      "eval_runtime": 77.2338,
      "eval_samples_per_second": 15.278,
      "eval_steps_per_second": 15.278,
      "step": 235
    },
    {
      "epoch": 1.2466095349030701,
      "grad_norm": 1.046875,
      "learning_rate": 6.4163205045469975e-06,
      "loss": 1.6958,
      "step": 236
    },
    {
      "epoch": 1.2519344371411931,
      "grad_norm": 1.109375,
      "learning_rate": 6.334987732757028e-06,
      "loss": 1.589,
      "step": 237
    },
    {
      "epoch": 1.257259339379316,
      "grad_norm": 1.0390625,
      "learning_rate": 6.25393406584088e-06,
      "loss": 1.6417,
      "step": 238
    },
    {
      "epoch": 1.262584241617439,
      "grad_norm": 0.9609375,
      "learning_rate": 6.173165676349103e-06,
      "loss": 1.6412,
      "step": 239
    },
    {
      "epoch": 1.267909143855562,
      "grad_norm": 1.046875,
      "learning_rate": 6.092688715107265e-06,
      "loss": 1.6539,
      "step": 240
    },
    {
      "epoch": 1.273234046093685,
      "grad_norm": 1.03125,
      "learning_rate": 6.0125093107475385e-06,
      "loss": 1.6194,
      "step": 241
    },
    {
      "epoch": 1.278558948331808,
      "grad_norm": 0.9921875,
      "learning_rate": 5.932633569242e-06,
      "loss": 1.6566,
      "step": 242
    },
    {
      "epoch": 1.2838838505699308,
      "grad_norm": 1.0234375,
      "learning_rate": 5.853067573437612e-06,
      "loss": 1.6734,
      "step": 243
    },
    {
      "epoch": 1.289208752808054,
      "grad_norm": 1.046875,
      "learning_rate": 5.773817382593008e-06,
      "loss": 1.7109,
      "step": 244
    },
    {
      "epoch": 1.2945336550461768,
      "grad_norm": 1.015625,
      "learning_rate": 5.694889031917047e-06,
      "loss": 1.5984,
      "step": 245
    },
    {
      "epoch": 1.2998585572842998,
      "grad_norm": 1.046875,
      "learning_rate": 5.616288532109225e-06,
      "loss": 1.7059,
      "step": 246
    },
    {
      "epoch": 1.3051834595224228,
      "grad_norm": 1.0390625,
      "learning_rate": 5.5380218689019125e-06,
      "loss": 1.6666,
      "step": 247
    },
    {
      "epoch": 1.3105083617605457,
      "grad_norm": 1.0234375,
      "learning_rate": 5.460095002604533e-06,
      "loss": 1.6752,
      "step": 248
    },
    {
      "epoch": 1.3158332639986687,
      "grad_norm": 1.0625,
      "learning_rate": 5.382513867649663e-06,
      "loss": 1.684,
      "step": 249
    },
    {
      "epoch": 1.3211581662367917,
      "grad_norm": 0.98046875,
      "learning_rate": 5.305284372141095e-06,
      "loss": 1.6072,
      "step": 250
    },
    {
      "epoch": 1.3264830684749147,
      "grad_norm": 0.9296875,
      "learning_rate": 5.228412397403916e-06,
      "loss": 1.4978,
      "step": 251
    },
    {
      "epoch": 1.3318079707130377,
      "grad_norm": 1.0390625,
      "learning_rate": 5.151903797536631e-06,
      "loss": 1.6164,
      "step": 252
    },
    {
      "epoch": 1.3371328729511607,
      "grad_norm": 1.0703125,
      "learning_rate": 5.075764398965331e-06,
      "loss": 1.5904,
      "step": 253
    },
    {
      "epoch": 1.3424577751892837,
      "grad_norm": 1.0,
      "learning_rate": 5.000000000000003e-06,
      "loss": 1.5924,
      "step": 254
    },
    {
      "epoch": 1.3477826774274067,
      "grad_norm": 1.0390625,
      "learning_rate": 4.924616370392962e-06,
      "loss": 1.6521,
      "step": 255
    },
    {
      "epoch": 1.3531075796655296,
      "grad_norm": 1.0234375,
      "learning_rate": 4.849619250899458e-06,
      "loss": 1.6124,
      "step": 256
    },
    {
      "epoch": 1.3584324819036526,
      "grad_norm": 1.0234375,
      "learning_rate": 4.775014352840512e-06,
      "loss": 1.6634,
      "step": 257
    },
    {
      "epoch": 1.3637573841417754,
      "grad_norm": 0.984375,
      "learning_rate": 4.700807357667953e-06,
      "loss": 1.628,
      "step": 258
    },
    {
      "epoch": 1.3690822863798986,
      "grad_norm": 1.046875,
      "learning_rate": 4.627003916531761e-06,
      "loss": 1.7282,
      "step": 259
    },
    {
      "epoch": 1.3744071886180214,
      "grad_norm": 1.0625,
      "learning_rate": 4.5536096498497295e-06,
      "loss": 1.7081,
      "step": 260
    },
    {
      "epoch": 1.3797320908561446,
      "grad_norm": 1.03125,
      "learning_rate": 4.480630146879419e-06,
      "loss": 1.6772,
      "step": 261
    },
    {
      "epoch": 1.3850569930942673,
      "grad_norm": 1.0234375,
      "learning_rate": 4.408070965292534e-06,
      "loss": 1.695,
      "step": 262
    },
    {
      "epoch": 1.3903818953323903,
      "grad_norm": 0.97265625,
      "learning_rate": 4.335937630751675e-06,
      "loss": 1.6414,
      "step": 263
    },
    {
      "epoch": 1.3957067975705133,
      "grad_norm": 1.015625,
      "learning_rate": 4.264235636489542e-06,
      "loss": 1.6425,
      "step": 264
    },
    {
      "epoch": 1.4010316998086363,
      "grad_norm": 1.0390625,
      "learning_rate": 4.192970442890602e-06,
      "loss": 1.6466,
      "step": 265
    },
    {
      "epoch": 1.4063566020467593,
      "grad_norm": 1.0,
      "learning_rate": 4.12214747707527e-06,
      "loss": 1.6378,
      "step": 266
    },
    {
      "epoch": 1.4116815042848823,
      "grad_norm": 1.015625,
      "learning_rate": 4.051772132486589e-06,
      "loss": 1.718,
      "step": 267
    },
    {
      "epoch": 1.4170064065230052,
      "grad_norm": 1.1015625,
      "learning_rate": 3.981849768479516e-06,
      "loss": 1.6534,
      "step": 268
    },
    {
      "epoch": 1.4223313087611282,
      "grad_norm": 0.98046875,
      "learning_rate": 3.912385709912794e-06,
      "loss": 1.6769,
      "step": 269
    },
    {
      "epoch": 1.4276562109992512,
      "grad_norm": 1.046875,
      "learning_rate": 3.8433852467434175e-06,
      "loss": 1.6337,
      "step": 270
    },
    {
      "epoch": 1.4329811132373742,
      "grad_norm": 1.0625,
      "learning_rate": 3.774853633623806e-06,
      "loss": 1.7151,
      "step": 271
    },
    {
      "epoch": 1.4383060154754972,
      "grad_norm": 1.0234375,
      "learning_rate": 3.7067960895016277e-06,
      "loss": 1.6018,
      "step": 272
    },
    {
      "epoch": 1.4436309177136202,
      "grad_norm": 1.03125,
      "learning_rate": 3.6392177972223596e-06,
      "loss": 1.597,
      "step": 273
    },
    {
      "epoch": 1.4489558199517432,
      "grad_norm": 1.0390625,
      "learning_rate": 3.5721239031346067e-06,
      "loss": 1.6465,
      "step": 274
    },
    {
      "epoch": 1.454280722189866,
      "grad_norm": 1.03125,
      "learning_rate": 3.505519516698165e-06,
      "loss": 1.7173,
      "step": 275
    },
    {
      "epoch": 1.4596056244279891,
      "grad_norm": 1.015625,
      "learning_rate": 3.4394097100949286e-06,
      "loss": 1.5711,
      "step": 276
    },
    {
      "epoch": 1.464930526666112,
      "grad_norm": 1.03125,
      "learning_rate": 3.3737995178426276e-06,
      "loss": 1.7184,
      "step": 277
    },
    {
      "epoch": 1.4702554289042349,
      "grad_norm": 1.0234375,
      "learning_rate": 3.308693936411421e-06,
      "loss": 1.6222,
      "step": 278
    },
    {
      "epoch": 1.4755803311423579,
      "grad_norm": 0.99609375,
      "learning_rate": 3.2440979238433977e-06,
      "loss": 1.7164,
      "step": 279
    },
    {
      "epoch": 1.4809052333804809,
      "grad_norm": 1.03125,
      "learning_rate": 3.1800163993750166e-06,
      "loss": 1.8001,
      "step": 280
    },
    {
      "epoch": 1.4862301356186038,
      "grad_norm": 1.125,
      "learning_rate": 3.116454243062459e-06,
      "loss": 1.6933,
      "step": 281
    },
    {
      "epoch": 1.4915550378567268,
      "grad_norm": 1.0234375,
      "learning_rate": 3.0534162954100264e-06,
      "loss": 1.6367,
      "step": 282
    },
    {
      "epoch": 1.4915550378567268,
      "eval_loss": 1.6898616552352905,
      "eval_runtime": 78.3804,
      "eval_samples_per_second": 15.055,
      "eval_steps_per_second": 15.055,
      "step": 282
    },
    {
      "epoch": 1.4968799400948498,
      "grad_norm": 1.015625,
      "learning_rate": 2.990907357001491e-06,
      "loss": 1.6462,
      "step": 283
    },
    {
      "epoch": 1.5022048423329728,
      "grad_norm": 0.9921875,
      "learning_rate": 2.9289321881345257e-06,
      "loss": 1.6232,
      "step": 284
    },
    {
      "epoch": 1.5075297445710958,
      "grad_norm": 1.015625,
      "learning_rate": 2.867495508458186e-06,
      "loss": 1.7229,
      "step": 285
    },
    {
      "epoch": 1.5128546468092188,
      "grad_norm": 1.03125,
      "learning_rate": 2.8066019966134907e-06,
      "loss": 1.6581,
      "step": 286
    },
    {
      "epoch": 1.5181795490473418,
      "grad_norm": 0.98046875,
      "learning_rate": 2.746256289877126e-06,
      "loss": 1.6565,
      "step": 287
    },
    {
      "epoch": 1.5235044512854645,
      "grad_norm": 1.015625,
      "learning_rate": 2.6864629838082957e-06,
      "loss": 1.646,
      "step": 288
    },
    {
      "epoch": 1.5288293535235877,
      "grad_norm": 0.97265625,
      "learning_rate": 2.6272266318987606e-06,
      "loss": 1.6165,
      "step": 289
    },
    {
      "epoch": 1.5341542557617105,
      "grad_norm": 0.953125,
      "learning_rate": 2.5685517452260566e-06,
      "loss": 1.6342,
      "step": 290
    },
    {
      "epoch": 1.5394791579998337,
      "grad_norm": 1.0390625,
      "learning_rate": 2.5104427921099783e-06,
      "loss": 1.6765,
      "step": 291
    },
    {
      "epoch": 1.5448040602379565,
      "grad_norm": 1.0078125,
      "learning_rate": 2.45290419777228e-06,
      "loss": 1.6634,
      "step": 292
    },
    {
      "epoch": 1.5501289624760797,
      "grad_norm": 0.98828125,
      "learning_rate": 2.395940343999691e-06,
      "loss": 1.6478,
      "step": 293
    },
    {
      "epoch": 1.5554538647142024,
      "grad_norm": 1.0546875,
      "learning_rate": 2.339555568810221e-06,
      "loss": 1.6321,
      "step": 294
    },
    {
      "epoch": 1.5607787669523256,
      "grad_norm": 1.03125,
      "learning_rate": 2.2837541661228024e-06,
      "loss": 1.7033,
      "step": 295
    },
    {
      "epoch": 1.5661036691904484,
      "grad_norm": 0.95703125,
      "learning_rate": 2.2285403854302912e-06,
      "loss": 1.562,
      "step": 296
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 1.0078125,
      "learning_rate": 2.173918431475861e-06,
      "loss": 1.6686,
      "step": 297
    },
    {
      "epoch": 1.5767534736666944,
      "grad_norm": 1.015625,
      "learning_rate": 2.119892463932781e-06,
      "loss": 1.5676,
      "step": 298
    },
    {
      "epoch": 1.5820783759048174,
      "grad_norm": 0.9921875,
      "learning_rate": 2.0664665970876496e-06,
      "loss": 1.6761,
      "step": 299
    },
    {
      "epoch": 1.5874032781429404,
      "grad_norm": 1.015625,
      "learning_rate": 2.013644899527074e-06,
      "loss": 1.6786,
      "step": 300
    },
    {
      "epoch": 1.5927281803810633,
      "grad_norm": 1.0625,
      "learning_rate": 1.961431393827827e-06,
      "loss": 1.6917,
      "step": 301
    },
    {
      "epoch": 1.5980530826191863,
      "grad_norm": 1.0,
      "learning_rate": 1.9098300562505266e-06,
      "loss": 1.6204,
      "step": 302
    },
    {
      "epoch": 1.6033779848573093,
      "grad_norm": 1.046875,
      "learning_rate": 1.858844816436809e-06,
      "loss": 1.6689,
      "step": 303
    },
    {
      "epoch": 1.6087028870954323,
      "grad_norm": 0.98046875,
      "learning_rate": 1.808479557110081e-06,
      "loss": 1.7041,
      "step": 304
    },
    {
      "epoch": 1.614027789333555,
      "grad_norm": 0.96875,
      "learning_rate": 1.7587381137798432e-06,
      "loss": 1.6656,
      "step": 305
    },
    {
      "epoch": 1.6193526915716783,
      "grad_norm": 1.0,
      "learning_rate": 1.709624274449584e-06,
      "loss": 1.6269,
      "step": 306
    },
    {
      "epoch": 1.624677593809801,
      "grad_norm": 1.03125,
      "learning_rate": 1.6611417793283192e-06,
      "loss": 1.5737,
      "step": 307
    },
    {
      "epoch": 1.6300024960479242,
      "grad_norm": 1.0078125,
      "learning_rate": 1.6132943205457607e-06,
      "loss": 1.722,
      "step": 308
    },
    {
      "epoch": 1.635327398286047,
      "grad_norm": 0.99609375,
      "learning_rate": 1.566085541871145e-06,
      "loss": 1.683,
      "step": 309
    },
    {
      "epoch": 1.6406523005241702,
      "grad_norm": 1.0546875,
      "learning_rate": 1.5195190384357405e-06,
      "loss": 1.6843,
      "step": 310
    },
    {
      "epoch": 1.645977202762293,
      "grad_norm": 1.0390625,
      "learning_rate": 1.4735983564590784e-06,
      "loss": 1.6979,
      "step": 311
    },
    {
      "epoch": 1.651302105000416,
      "grad_norm": 1.015625,
      "learning_rate": 1.4283269929788779e-06,
      "loss": 1.6396,
      "step": 312
    },
    {
      "epoch": 1.656627007238539,
      "grad_norm": 0.99609375,
      "learning_rate": 1.3837083955847418e-06,
      "loss": 1.6738,
      "step": 313
    },
    {
      "epoch": 1.661951909476662,
      "grad_norm": 0.94921875,
      "learning_rate": 1.339745962155613e-06,
      "loss": 1.6566,
      "step": 314
    },
    {
      "epoch": 1.667276811714785,
      "grad_norm": 1.0078125,
      "learning_rate": 1.2964430406010032e-06,
      "loss": 1.6679,
      "step": 315
    },
    {
      "epoch": 1.672601713952908,
      "grad_norm": 1.0,
      "learning_rate": 1.2538029286060428e-06,
      "loss": 1.6892,
      "step": 316
    },
    {
      "epoch": 1.677926616191031,
      "grad_norm": 1.0390625,
      "learning_rate": 1.2118288733803474e-06,
      "loss": 1.6914,
      "step": 317
    },
    {
      "epoch": 1.6832515184291539,
      "grad_norm": 1.015625,
      "learning_rate": 1.1705240714107301e-06,
      "loss": 1.5954,
      "step": 318
    },
    {
      "epoch": 1.6885764206672769,
      "grad_norm": 0.9765625,
      "learning_rate": 1.129891668217783e-06,
      "loss": 1.633,
      "step": 319
    },
    {
      "epoch": 1.6939013229053996,
      "grad_norm": 1.0546875,
      "learning_rate": 1.0899347581163222e-06,
      "loss": 1.6976,
      "step": 320
    },
    {
      "epoch": 1.6992262251435228,
      "grad_norm": 0.9921875,
      "learning_rate": 1.0506563839797501e-06,
      "loss": 1.6283,
      "step": 321
    },
    {
      "epoch": 1.7045511273816456,
      "grad_norm": 0.9765625,
      "learning_rate": 1.012059537008332e-06,
      "loss": 1.6094,
      "step": 322
    },
    {
      "epoch": 1.7098760296197688,
      "grad_norm": 1.0390625,
      "learning_rate": 9.74147156501396e-07,
      "loss": 1.6049,
      "step": 323
    },
    {
      "epoch": 1.7152009318578916,
      "grad_norm": 1.0546875,
      "learning_rate": 9.369221296335007e-07,
      "loss": 1.6529,
      "step": 324
    },
    {
      "epoch": 1.7205258340960148,
      "grad_norm": 1.09375,
      "learning_rate": 9.00387291234569e-07,
      "loss": 1.7539,
      "step": 325
    },
    {
      "epoch": 1.7258507363341375,
      "grad_norm": 1.03125,
      "learning_rate": 8.645454235739903e-07,
      "loss": 1.6844,
      "step": 326
    },
    {
      "epoch": 1.7311756385722605,
      "grad_norm": 0.98046875,
      "learning_rate": 8.293992561487596e-07,
      "loss": 1.6409,
      "step": 327
    },
    {
      "epoch": 1.7365005408103835,
      "grad_norm": 1.03125,
      "learning_rate": 7.949514654755963e-07,
      "loss": 1.7356,
      "step": 328
    },
    {
      "epoch": 1.7418254430485065,
      "grad_norm": 1.0625,
      "learning_rate": 7.612046748871327e-07,
      "loss": 1.6681,
      "step": 329
    },
    {
      "epoch": 1.7418254430485065,
      "eval_loss": 1.6896613836288452,
      "eval_runtime": 77.1918,
      "eval_samples_per_second": 15.287,
      "eval_steps_per_second": 15.287,
      "step": 329
    },
    {
      "epoch": 1.7471503452866295,
      "grad_norm": 1.046875,
      "learning_rate": 7.281614543321269e-07,
      "loss": 1.6493,
      "step": 330
    },
    {
      "epoch": 1.7524752475247525,
      "grad_norm": 1.0234375,
      "learning_rate": 6.958243201797554e-07,
      "loss": 1.6299,
      "step": 331
    },
    {
      "epoch": 1.7578001497628755,
      "grad_norm": 1.0234375,
      "learning_rate": 6.641957350279838e-07,
      "loss": 1.6882,
      "step": 332
    },
    {
      "epoch": 1.7631250520009984,
      "grad_norm": 1.0390625,
      "learning_rate": 6.332781075160244e-07,
      "loss": 1.6256,
      "step": 333
    },
    {
      "epoch": 1.7684499542391214,
      "grad_norm": 1.0234375,
      "learning_rate": 6.030737921409169e-07,
      "loss": 1.5845,
      "step": 334
    },
    {
      "epoch": 1.7737748564772442,
      "grad_norm": 0.984375,
      "learning_rate": 5.735850890782158e-07,
      "loss": 1.6066,
      "step": 335
    },
    {
      "epoch": 1.7790997587153674,
      "grad_norm": 1.03125,
      "learning_rate": 5.448142440068316e-07,
      "loss": 1.6751,
      "step": 336
    },
    {
      "epoch": 1.7844246609534902,
      "grad_norm": 0.97265625,
      "learning_rate": 5.167634479380068e-07,
      "loss": 1.6368,
      "step": 337
    },
    {
      "epoch": 1.7897495631916134,
      "grad_norm": 1.0234375,
      "learning_rate": 4.894348370484648e-07,
      "loss": 1.7731,
      "step": 338
    },
    {
      "epoch": 1.7950744654297361,
      "grad_norm": 1.03125,
      "learning_rate": 4.628304925177318e-07,
      "loss": 1.6714,
      "step": 339
    },
    {
      "epoch": 1.8003993676678594,
      "grad_norm": 1.1015625,
      "learning_rate": 4.3695244036964567e-07,
      "loss": 1.7154,
      "step": 340
    },
    {
      "epoch": 1.8057242699059821,
      "grad_norm": 0.94921875,
      "learning_rate": 4.118026513180695e-07,
      "loss": 1.5225,
      "step": 341
    },
    {
      "epoch": 1.8110491721441053,
      "grad_norm": 1.0,
      "learning_rate": 3.8738304061681107e-07,
      "loss": 1.6143,
      "step": 342
    },
    {
      "epoch": 1.816374074382228,
      "grad_norm": 1.0,
      "learning_rate": 3.6369546791377054e-07,
      "loss": 1.6301,
      "step": 343
    },
    {
      "epoch": 1.821698976620351,
      "grad_norm": 0.95703125,
      "learning_rate": 3.4074173710931804e-07,
      "loss": 1.5861,
      "step": 344
    },
    {
      "epoch": 1.827023878858474,
      "grad_norm": 0.9921875,
      "learning_rate": 3.185235962189237e-07,
      "loss": 1.6469,
      "step": 345
    },
    {
      "epoch": 1.832348781096597,
      "grad_norm": 0.95703125,
      "learning_rate": 2.970427372400353e-07,
      "loss": 1.5941,
      "step": 346
    },
    {
      "epoch": 1.83767368333472,
      "grad_norm": 0.9609375,
      "learning_rate": 2.7630079602323447e-07,
      "loss": 1.6405,
      "step": 347
    },
    {
      "epoch": 1.842998585572843,
      "grad_norm": 1.03125,
      "learning_rate": 2.5629935214764866e-07,
      "loss": 1.6329,
      "step": 348
    },
    {
      "epoch": 1.848323487810966,
      "grad_norm": 1.03125,
      "learning_rate": 2.370399288006664e-07,
      "loss": 1.62,
      "step": 349
    },
    {
      "epoch": 1.853648390049089,
      "grad_norm": 1.0078125,
      "learning_rate": 2.1852399266194312e-07,
      "loss": 1.5775,
      "step": 350
    },
    {
      "epoch": 1.858973292287212,
      "grad_norm": 0.984375,
      "learning_rate": 2.0075295379170413e-07,
      "loss": 1.6226,
      "step": 351
    },
    {
      "epoch": 1.8642981945253347,
      "grad_norm": 1.0234375,
      "learning_rate": 1.8372816552336025e-07,
      "loss": 1.5957,
      "step": 352
    },
    {
      "epoch": 1.869623096763458,
      "grad_norm": 1.015625,
      "learning_rate": 1.6745092436045495e-07,
      "loss": 1.6389,
      "step": 353
    },
    {
      "epoch": 1.8749479990015807,
      "grad_norm": 1.0546875,
      "learning_rate": 1.519224698779198e-07,
      "loss": 1.6588,
      "step": 354
    },
    {
      "epoch": 1.880272901239704,
      "grad_norm": 0.9765625,
      "learning_rate": 1.3714398462768563e-07,
      "loss": 1.5915,
      "step": 355
    },
    {
      "epoch": 1.8855978034778267,
      "grad_norm": 1.03125,
      "learning_rate": 1.231165940486234e-07,
      "loss": 1.6454,
      "step": 356
    },
    {
      "epoch": 1.89092270571595,
      "grad_norm": 1.03125,
      "learning_rate": 1.0984136638083176e-07,
      "loss": 1.6706,
      "step": 357
    },
    {
      "epoch": 1.8962476079540727,
      "grad_norm": 1.046875,
      "learning_rate": 9.731931258429638e-08,
      "loss": 1.6479,
      "step": 358
    },
    {
      "epoch": 1.9015725101921956,
      "grad_norm": 0.93359375,
      "learning_rate": 8.555138626189619e-08,
      "loss": 1.6405,
      "step": 359
    },
    {
      "epoch": 1.9068974124303186,
      "grad_norm": 1.0234375,
      "learning_rate": 7.453848358678018e-08,
      "loss": 1.6638,
      "step": 360
    },
    {
      "epoch": 1.9122223146684416,
      "grad_norm": 1.046875,
      "learning_rate": 6.428144323412544e-08,
      "loss": 1.6709,
      "step": 361
    },
    {
      "epoch": 1.9175472169065646,
      "grad_norm": 1.0546875,
      "learning_rate": 5.4781046317267103e-08,
      "loss": 1.6191,
      "step": 362
    },
    {
      "epoch": 1.9228721191446876,
      "grad_norm": 1.0390625,
      "learning_rate": 4.603801632821148e-08,
      "loss": 1.7144,
      "step": 363
    },
    {
      "epoch": 1.9281970213828106,
      "grad_norm": 1.0078125,
      "learning_rate": 3.805301908254455e-08,
      "loss": 1.6358,
      "step": 364
    },
    {
      "epoch": 1.9335219236209336,
      "grad_norm": 1.0078125,
      "learning_rate": 3.082666266872036e-08,
      "loss": 1.6961,
      "step": 365
    },
    {
      "epoch": 1.9388468258590565,
      "grad_norm": 1.046875,
      "learning_rate": 2.4359497401758026e-08,
      "loss": 1.6074,
      "step": 366
    },
    {
      "epoch": 1.9441717280971793,
      "grad_norm": 1.03125,
      "learning_rate": 1.86520157813308e-08,
      "loss": 1.5905,
      "step": 367
    },
    {
      "epoch": 1.9494966303353025,
      "grad_norm": 0.97265625,
      "learning_rate": 1.370465245426167e-08,
      "loss": 1.6317,
      "step": 368
    },
    {
      "epoch": 1.9548215325734253,
      "grad_norm": 0.98828125,
      "learning_rate": 9.517784181422018e-09,
      "loss": 1.5783,
      "step": 369
    },
    {
      "epoch": 1.9601464348115485,
      "grad_norm": 0.98828125,
      "learning_rate": 6.091729809042379e-09,
      "loss": 1.7104,
      "step": 370
    },
    {
      "epoch": 1.9654713370496713,
      "grad_norm": 1.0546875,
      "learning_rate": 3.4267502444274013e-09,
      "loss": 1.7328,
      "step": 371
    },
    {
      "epoch": 1.9707962392877945,
      "grad_norm": 1.03125,
      "learning_rate": 1.5230484360873043e-09,
      "loss": 1.6749,
      "step": 372
    },
    {
      "epoch": 1.9761211415259172,
      "grad_norm": 0.98046875,
      "learning_rate": 3.807693582869032e-10,
      "loss": 1.5706,
      "step": 373
    },
    {
      "epoch": 1.9814460437640404,
      "grad_norm": 1.0703125,
      "learning_rate": 0.0,
      "loss": 1.662,
      "step": 374
    }
  ],
  "logging_steps": 1,
  "max_steps": 374,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 187,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.8540467560146534e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}