{
  "best_metric": 1.9184670448303223,
  "best_model_checkpoint": "/home/datta0/models/lora_final/Qwen2-7B_pct_ortho_r16/checkpoint-32",
  "epoch": 0.9996779388083736,
  "eval_steps": 8,
  "global_step": 388,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0025764895330112722,
      "grad_norm": 2.5038321018218994,
      "learning_rate": 1.25e-05,
      "loss": 2.0463,
      "step": 1
    },
    {
      "epoch": 0.010305958132045089,
      "grad_norm": 2.0884063243865967,
      "learning_rate": 5e-05,
      "loss": 2.1379,
      "step": 4
    },
    {
      "epoch": 0.020611916264090178,
      "grad_norm": 2.1778032779693604,
      "learning_rate": 0.0001,
      "loss": 2.0486,
      "step": 8
    },
    {
      "epoch": 0.020611916264090178,
      "eval_loss": 1.997605562210083,
      "eval_runtime": 130.7432,
      "eval_samples_per_second": 1.874,
      "eval_steps_per_second": 0.941,
      "step": 8
    },
    {
      "epoch": 0.030917874396135265,
      "grad_norm": 1.3578734397888184,
      "learning_rate": 9.997266286704631e-05,
      "loss": 2.0017,
      "step": 12
    },
    {
      "epoch": 0.041223832528180356,
      "grad_norm": 1.0907150506973267,
      "learning_rate": 9.989068136093873e-05,
      "loss": 1.9839,
      "step": 16
    },
    {
      "epoch": 0.041223832528180356,
      "eval_loss": 1.9347867965698242,
      "eval_runtime": 133.851,
      "eval_samples_per_second": 1.83,
      "eval_steps_per_second": 0.919,
      "step": 16
    },
    {
      "epoch": 0.05152979066022544,
      "grad_norm": 1.1747814416885376,
      "learning_rate": 9.975414512725057e-05,
      "loss": 1.9421,
      "step": 20
    },
    {
      "epoch": 0.06183574879227053,
      "grad_norm": 1.2741020917892456,
      "learning_rate": 9.956320346634876e-05,
      "loss": 2.0083,
      "step": 24
    },
    {
      "epoch": 0.06183574879227053,
      "eval_loss": 1.9231261014938354,
      "eval_runtime": 123.8213,
      "eval_samples_per_second": 1.979,
      "eval_steps_per_second": 0.993,
      "step": 24
    },
    {
      "epoch": 0.07214170692431562,
      "grad_norm": 1.1227424144744873,
      "learning_rate": 9.931806517013612e-05,
      "loss": 1.9946,
      "step": 28
    },
    {
      "epoch": 0.08244766505636071,
      "grad_norm": 1.14728581905365,
      "learning_rate": 9.901899829374047e-05,
      "loss": 1.923,
      "step": 32
    },
    {
      "epoch": 0.08244766505636071,
      "eval_loss": 1.9184670448303223,
      "eval_runtime": 236.0583,
      "eval_samples_per_second": 1.038,
      "eval_steps_per_second": 0.521,
      "step": 32
    },
    {
      "epoch": 0.0927536231884058,
      "grad_norm": 1.0226891040802002,
      "learning_rate": 9.86663298624003e-05,
      "loss": 1.9968,
      "step": 36
    },
    {
      "epoch": 0.10305958132045089,
      "grad_norm": 1.1554687023162842,
      "learning_rate": 9.826044551386744e-05,
      "loss": 1.9734,
      "step": 40
    },
    {
      "epoch": 0.10305958132045089,
      "eval_loss": 1.9200371503829956,
      "eval_runtime": 131.8583,
      "eval_samples_per_second": 1.858,
      "eval_steps_per_second": 0.933,
      "step": 40
    },
    {
      "epoch": 0.11336553945249597,
      "grad_norm": 1.1349327564239502,
      "learning_rate": 9.780178907671789e-05,
      "loss": 1.8989,
      "step": 44
    },
    {
      "epoch": 0.12367149758454106,
      "grad_norm": 0.9330633282661438,
      "learning_rate": 9.729086208503174e-05,
      "loss": 1.9761,
      "step": 48
    },
    {
      "epoch": 0.12367149758454106,
      "eval_loss": 1.9230433702468872,
      "eval_runtime": 117.734,
      "eval_samples_per_second": 2.081,
      "eval_steps_per_second": 1.045,
      "step": 48
    },
    {
      "epoch": 0.13397745571658615,
      "grad_norm": 1.2882357835769653,
      "learning_rate": 9.672822322997305e-05,
      "loss": 1.9139,
      "step": 52
    },
    {
      "epoch": 0.14428341384863125,
      "grad_norm": 1.2212823629379272,
      "learning_rate": 9.611448774886924e-05,
      "loss": 1.9869,
      "step": 56
    },
    {
      "epoch": 0.14428341384863125,
      "eval_loss": 1.922610878944397,
      "eval_runtime": 133.6918,
      "eval_samples_per_second": 1.833,
      "eval_steps_per_second": 0.92,
      "step": 56
    },
    {
      "epoch": 0.15458937198067632,
      "grad_norm": 1.14687979221344,
      "learning_rate": 9.545032675245813e-05,
      "loss": 1.9617,
      "step": 60
    },
    {
      "epoch": 0.16489533011272142,
      "grad_norm": 1.2522599697113037,
      "learning_rate": 9.473646649103818e-05,
      "loss": 1.9196,
      "step": 64
    },
    {
      "epoch": 0.16489533011272142,
      "eval_loss": 1.923781156539917,
      "eval_runtime": 122.9351,
      "eval_samples_per_second": 1.993,
      "eval_steps_per_second": 1.001,
      "step": 64
    },
    {
      "epoch": 0.1752012882447665,
      "grad_norm": 1.1364014148712158,
      "learning_rate": 9.397368756032445e-05,
      "loss": 2.0391,
      "step": 68
    },
    {
      "epoch": 0.1855072463768116,
      "grad_norm": 1.2489475011825562,
      "learning_rate": 9.316282404787871e-05,
      "loss": 1.9871,
      "step": 72
    },
    {
      "epoch": 0.1855072463768116,
      "eval_loss": 1.9276381731033325,
      "eval_runtime": 228.2046,
      "eval_samples_per_second": 1.074,
      "eval_steps_per_second": 0.539,
      "step": 72
    },
    {
      "epoch": 0.19581320450885667,
      "grad_norm": 1.3833423852920532,
      "learning_rate": 9.230476262104677e-05,
      "loss": 2.0093,
      "step": 76
    },
    {
      "epoch": 0.20611916264090177,
      "grad_norm": 1.1054407358169556,
      "learning_rate": 9.140044155740101e-05,
      "loss": 2.0064,
      "step": 80
    },
    {
      "epoch": 0.20611916264090177,
      "eval_loss": 1.9251201152801514,
      "eval_runtime": 129.5681,
      "eval_samples_per_second": 1.891,
      "eval_steps_per_second": 0.949,
      "step": 80
    },
    {
      "epoch": 0.21642512077294687,
      "grad_norm": 1.2429131269454956,
      "learning_rate": 9.045084971874738e-05,
      "loss": 2.064,
      "step": 84
    },
    {
      "epoch": 0.22673107890499195,
      "grad_norm": 1.23136568069458,
      "learning_rate": 8.945702546981969e-05,
      "loss": 1.9864,
      "step": 88
    },
    {
      "epoch": 0.22673107890499195,
      "eval_loss": 1.9282381534576416,
      "eval_runtime": 127.4266,
      "eval_samples_per_second": 1.923,
      "eval_steps_per_second": 0.965,
      "step": 88
    },
    {
      "epoch": 0.23703703703703705,
      "grad_norm": 1.0550172328948975,
      "learning_rate": 8.842005554284296e-05,
      "loss": 1.903,
      "step": 92
    },
    {
      "epoch": 0.24734299516908212,
      "grad_norm": 1.214440941810608,
      "learning_rate": 8.73410738492077e-05,
      "loss": 1.9204,
      "step": 96
    },
    {
      "epoch": 0.24734299516908212,
      "eval_loss": 1.9318650960922241,
      "eval_runtime": 136.7115,
      "eval_samples_per_second": 1.792,
      "eval_steps_per_second": 0.9,
      "step": 96
    },
    {
      "epoch": 0.2576489533011272,
      "grad_norm": 1.0632654428482056,
      "learning_rate": 8.622126023955446e-05,
      "loss": 2.0197,
      "step": 100
    },
    {
      "epoch": 0.2679549114331723,
      "grad_norm": 1.2511614561080933,
      "learning_rate": 8.506183921362443e-05,
      "loss": 2.0003,
      "step": 104
    },
    {
      "epoch": 0.2679549114331723,
      "eval_loss": 1.9295417070388794,
      "eval_runtime": 133.2173,
      "eval_samples_per_second": 1.839,
      "eval_steps_per_second": 0.923,
      "step": 104
    },
    {
      "epoch": 0.2782608695652174,
      "grad_norm": 1.1069486141204834,
      "learning_rate": 8.386407858128706e-05,
      "loss": 2.0178,
      "step": 108
    },
    {
      "epoch": 0.2885668276972625,
      "grad_norm": 1.060439944267273,
      "learning_rate": 8.262928807620843e-05,
      "loss": 1.8821,
      "step": 112
    },
    {
      "epoch": 0.2885668276972625,
      "eval_loss": 1.9357030391693115,
      "eval_runtime": 233.1725,
      "eval_samples_per_second": 1.051,
      "eval_steps_per_second": 0.528,
      "step": 112
    },
    {
      "epoch": 0.29887278582930754,
      "grad_norm": 1.0485658645629883,
      "learning_rate": 8.135881792367686e-05,
      "loss": 2.0509,
      "step": 116
    },
    {
      "epoch": 0.30917874396135264,
      "grad_norm": 1.1843186616897583,
      "learning_rate": 8.005405736415126e-05,
      "loss": 1.9353,
      "step": 120
    },
    {
      "epoch": 0.30917874396135264,
      "eval_loss": 1.9354380369186401,
      "eval_runtime": 117.1403,
      "eval_samples_per_second": 2.092,
      "eval_steps_per_second": 1.05,
      "step": 120
    },
    {
      "epoch": 0.31948470209339774,
      "grad_norm": 1.2164537906646729,
      "learning_rate": 7.871643313414718e-05,
      "loss": 2.0319,
      "step": 124
    },
    {
      "epoch": 0.32979066022544284,
      "grad_norm": 1.0794636011123657,
      "learning_rate": 7.734740790612136e-05,
      "loss": 1.9737,
      "step": 128
    },
    {
      "epoch": 0.32979066022544284,
      "eval_loss": 1.939180850982666,
      "eval_runtime": 132.7839,
      "eval_samples_per_second": 1.845,
      "eval_steps_per_second": 0.926,
      "step": 128
    },
    {
      "epoch": 0.34009661835748795,
      "grad_norm": 1.2831894159317017,
      "learning_rate": 7.594847868906076e-05,
      "loss": 1.9799,
      "step": 132
    },
    {
      "epoch": 0.350402576489533,
      "grad_norm": 1.066001057624817,
      "learning_rate": 7.452117519152542e-05,
      "loss": 1.9608,
      "step": 136
    },
    {
      "epoch": 0.350402576489533,
      "eval_loss": 1.933682918548584,
      "eval_runtime": 118.9246,
      "eval_samples_per_second": 2.06,
      "eval_steps_per_second": 1.034,
      "step": 136
    },
    {
      "epoch": 0.3607085346215781,
      "grad_norm": 1.1267732381820679,
      "learning_rate": 7.30670581489344e-05,
      "loss": 1.9253,
      "step": 140
    },
    {
      "epoch": 0.3710144927536232,
      "grad_norm": 1.232575535774231,
      "learning_rate": 7.158771761692464e-05,
      "loss": 1.928,
      "step": 144
    },
    {
      "epoch": 0.3710144927536232,
      "eval_loss": 1.9364768266677856,
      "eval_runtime": 232.9722,
      "eval_samples_per_second": 1.052,
      "eval_steps_per_second": 0.528,
      "step": 144
    },
    {
      "epoch": 0.3813204508856683,
      "grad_norm": 1.1609725952148438,
      "learning_rate": 7.008477123264848e-05,
      "loss": 1.9995,
      "step": 148
    },
    {
      "epoch": 0.39162640901771334,
      "grad_norm": 1.2346417903900146,
      "learning_rate": 6.855986244591104e-05,
      "loss": 2.0019,
      "step": 152
    },
    {
      "epoch": 0.39162640901771334,
      "eval_loss": 1.9326261281967163,
      "eval_runtime": 248.862,
      "eval_samples_per_second": 0.984,
      "eval_steps_per_second": 0.494,
      "step": 152
    },
    {
      "epoch": 0.40193236714975844,
      "grad_norm": 1.368483304977417,
      "learning_rate": 6.701465872208216e-05,
      "loss": 1.9931,
      "step": 156
    },
    {
      "epoch": 0.41223832528180354,
      "grad_norm": 1.1064777374267578,
      "learning_rate": 6.545084971874738e-05,
      "loss": 2.0525,
      "step": 160
    },
    {
      "epoch": 0.41223832528180354,
      "eval_loss": 1.940269112586975,
      "eval_runtime": 124.8033,
      "eval_samples_per_second": 1.963,
      "eval_steps_per_second": 0.986,
      "step": 160
    },
    {
      "epoch": 0.42254428341384864,
      "grad_norm": 1.1312705278396606,
      "learning_rate": 6.387014543809223e-05,
      "loss": 2.0432,
      "step": 164
    },
    {
      "epoch": 0.43285024154589374,
      "grad_norm": 1.2631207704544067,
      "learning_rate": 6.227427435703997e-05,
      "loss": 2.053,
      "step": 168
    },
    {
      "epoch": 0.43285024154589374,
      "eval_loss": 1.9401638507843018,
      "eval_runtime": 127.3455,
      "eval_samples_per_second": 1.924,
      "eval_steps_per_second": 0.966,
      "step": 168
    },
    {
      "epoch": 0.4431561996779388,
      "grad_norm": 1.2143282890319824,
      "learning_rate": 6.066498153718735e-05,
      "loss": 1.9412,
      "step": 172
    },
    {
      "epoch": 0.4534621578099839,
      "grad_norm": 1.3119618892669678,
      "learning_rate": 5.90440267166055e-05,
      "loss": 1.9342,
      "step": 176
    },
    {
      "epoch": 0.4534621578099839,
      "eval_loss": 1.9374455213546753,
      "eval_runtime": 128.0971,
      "eval_samples_per_second": 1.913,
      "eval_steps_per_second": 0.96,
      "step": 176
    },
    {
      "epoch": 0.463768115942029,
      "grad_norm": 1.404941439628601,
      "learning_rate": 5.74131823855921e-05,
      "loss": 2.0474,
      "step": 180
    },
    {
      "epoch": 0.4740740740740741,
      "grad_norm": 1.1769001483917236,
      "learning_rate": 5.577423184847932e-05,
      "loss": 1.9931,
      "step": 184
    },
    {
      "epoch": 0.4740740740740741,
      "eval_loss": 1.9400098323822021,
      "eval_runtime": 255.4556,
      "eval_samples_per_second": 0.959,
      "eval_steps_per_second": 0.481,
      "step": 184
    },
    {
      "epoch": 0.48438003220611914,
      "grad_norm": 1.0599452257156372,
      "learning_rate": 5.4128967273616625e-05,
      "loss": 1.9876,
      "step": 188
    },
    {
      "epoch": 0.49468599033816424,
      "grad_norm": 1.1136999130249023,
      "learning_rate": 5.247918773366112e-05,
      "loss": 2.0008,
      "step": 192
    },
    {
      "epoch": 0.49468599033816424,
      "eval_loss": 1.9412612915039062,
      "eval_runtime": 120.4032,
      "eval_samples_per_second": 2.035,
      "eval_steps_per_second": 1.022,
      "step": 192
    },
    {
      "epoch": 0.5049919484702093,
      "grad_norm": 1.4237738847732544,
      "learning_rate": 5.0826697238317935e-05,
      "loss": 1.9186,
      "step": 196
    },
    {
      "epoch": 0.5152979066022544,
      "grad_norm": 1.5648188591003418,
      "learning_rate": 4.917330276168208e-05,
      "loss": 1.9426,
      "step": 200
    },
    {
      "epoch": 0.5152979066022544,
      "eval_loss": 1.9405626058578491,
      "eval_runtime": 128.5673,
      "eval_samples_per_second": 1.906,
      "eval_steps_per_second": 0.957,
      "step": 200
    },
    {
      "epoch": 0.5256038647342995,
      "grad_norm": 1.2770507335662842,
      "learning_rate": 4.7520812266338885e-05,
      "loss": 2.062,
      "step": 204
    },
    {
      "epoch": 0.5359098228663446,
      "grad_norm": 1.2542366981506348,
      "learning_rate": 4.5871032726383386e-05,
      "loss": 1.9732,
      "step": 208
    },
    {
      "epoch": 0.5359098228663446,
      "eval_loss": 1.9409449100494385,
      "eval_runtime": 121.087,
      "eval_samples_per_second": 2.023,
      "eval_steps_per_second": 1.016,
      "step": 208
    },
    {
      "epoch": 0.5462157809983897,
      "grad_norm": 1.4167399406433105,
      "learning_rate": 4.4225768151520694e-05,
      "loss": 2.0026,
      "step": 212
    },
    {
      "epoch": 0.5565217391304348,
      "grad_norm": 1.3207523822784424,
      "learning_rate": 4.2586817614407895e-05,
      "loss": 2.0263,
      "step": 216
    },
    {
      "epoch": 0.5565217391304348,
      "eval_loss": 1.9430506229400635,
      "eval_runtime": 119.0525,
      "eval_samples_per_second": 2.058,
      "eval_steps_per_second": 1.033,
      "step": 216
    },
    {
      "epoch": 0.5668276972624798,
      "grad_norm": 1.2683689594268799,
      "learning_rate": 4.095597328339452e-05,
      "loss": 2.0507,
      "step": 220
    },
    {
      "epoch": 0.577133655394525,
      "grad_norm": 1.2979661226272583,
      "learning_rate": 3.933501846281267e-05,
      "loss": 1.9589,
      "step": 224
    },
    {
      "epoch": 0.577133655394525,
      "eval_loss": 1.944420337677002,
      "eval_runtime": 241.5642,
      "eval_samples_per_second": 1.014,
      "eval_steps_per_second": 0.509,
      "step": 224
    },
    {
      "epoch": 0.58743961352657,
      "grad_norm": 1.3029229640960693,
      "learning_rate": 3.772572564296005e-05,
      "loss": 2.0413,
      "step": 228
    },
    {
      "epoch": 0.5977455716586151,
      "grad_norm": 1.288469910621643,
      "learning_rate": 3.612985456190778e-05,
      "loss": 1.9824,
      "step": 232
    },
    {
      "epoch": 0.5977455716586151,
      "eval_loss": 1.9459607601165771,
      "eval_runtime": 135.6582,
      "eval_samples_per_second": 1.806,
      "eval_steps_per_second": 0.907,
      "step": 232
    },
    {
      "epoch": 0.6080515297906602,
      "grad_norm": 1.2693860530853271,
      "learning_rate": 3.4549150281252636e-05,
      "loss": 1.9893,
      "step": 236
    },
    {
      "epoch": 0.6183574879227053,
      "grad_norm": 1.4493002891540527,
      "learning_rate": 3.298534127791785e-05,
      "loss": 1.9252,
      "step": 240
    },
    {
      "epoch": 0.6183574879227053,
      "eval_loss": 1.9398853778839111,
      "eval_runtime": 123.6178,
      "eval_samples_per_second": 1.982,
      "eval_steps_per_second": 0.995,
      "step": 240
    },
    {
      "epoch": 0.6286634460547504,
      "grad_norm": 1.1343238353729248,
      "learning_rate": 3.144013755408895e-05,
      "loss": 1.9731,
      "step": 244
    },
    {
      "epoch": 0.6389694041867955,
      "grad_norm": 1.2282459735870361,
      "learning_rate": 2.991522876735154e-05,
      "loss": 1.9563,
      "step": 248
    },
    {
      "epoch": 0.6389694041867955,
      "eval_loss": 1.9399926662445068,
      "eval_runtime": 114.543,
      "eval_samples_per_second": 2.139,
      "eval_steps_per_second": 1.074,
      "step": 248
    },
    {
      "epoch": 0.6492753623188405,
      "grad_norm": 1.312157154083252,
      "learning_rate": 2.8412282383075363e-05,
      "loss": 1.9806,
      "step": 252
    },
    {
      "epoch": 0.6595813204508857,
      "grad_norm": 1.2436367273330688,
      "learning_rate": 2.693294185106562e-05,
      "loss": 2.0096,
      "step": 256
    },
    {
      "epoch": 0.6595813204508857,
      "eval_loss": 1.9414441585540771,
      "eval_runtime": 132.1293,
      "eval_samples_per_second": 1.854,
      "eval_steps_per_second": 0.931,
      "step": 256
    },
    {
      "epoch": 0.6698872785829307,
      "grad_norm": 1.241859793663025,
      "learning_rate": 2.547882480847461e-05,
      "loss": 1.9994,
      "step": 260
    },
    {
      "epoch": 0.6801932367149759,
      "grad_norm": 1.2852636575698853,
      "learning_rate": 2.405152131093926e-05,
      "loss": 1.9355,
      "step": 264
    },
    {
      "epoch": 0.6801932367149759,
      "eval_loss": 1.9419505596160889,
      "eval_runtime": 240.4599,
      "eval_samples_per_second": 1.019,
      "eval_steps_per_second": 0.512,
      "step": 264
    },
    {
      "epoch": 0.6904991948470209,
      "grad_norm": 1.2122435569763184,
      "learning_rate": 2.2652592093878666e-05,
      "loss": 2.0342,
      "step": 268
    },
    {
      "epoch": 0.700805152979066,
      "grad_norm": 1.203086495399475,
      "learning_rate": 2.128356686585282e-05,
      "loss": 2.003,
      "step": 272
    },
    {
      "epoch": 0.700805152979066,
      "eval_loss": 1.941483497619629,
      "eval_runtime": 127.3272,
      "eval_samples_per_second": 1.924,
      "eval_steps_per_second": 0.966,
      "step": 272
    },
    {
      "epoch": 0.7111111111111111,
      "grad_norm": 1.333521842956543,
      "learning_rate": 1.9945942635848748e-05,
      "loss": 1.9418,
      "step": 276
    },
    {
      "epoch": 0.7214170692431562,
      "grad_norm": 1.4001682996749878,
      "learning_rate": 1.8641182076323148e-05,
      "loss": 1.877,
      "step": 280
    },
    {
      "epoch": 0.7214170692431562,
      "eval_loss": 1.9395906925201416,
      "eval_runtime": 138.1205,
      "eval_samples_per_second": 1.774,
      "eval_steps_per_second": 0.891,
      "step": 280
    },
    {
      "epoch": 0.7317230273752013,
      "grad_norm": 1.3213990926742554,
      "learning_rate": 1.7370711923791567e-05,
      "loss": 2.0164,
      "step": 284
    },
    {
      "epoch": 0.7420289855072464,
      "grad_norm": 1.401374340057373,
      "learning_rate": 1.6135921418712956e-05,
      "loss": 2.0395,
      "step": 288
    },
    {
      "epoch": 0.7420289855072464,
      "eval_loss": 1.937800407409668,
      "eval_runtime": 131.2826,
      "eval_samples_per_second": 1.866,
      "eval_steps_per_second": 0.937,
      "step": 288
    },
    {
      "epoch": 0.7523349436392914,
      "grad_norm": 1.3368889093399048,
      "learning_rate": 1.4938160786375572e-05,
      "loss": 2.0054,
      "step": 292
    },
    {
      "epoch": 0.7626409017713366,
      "grad_norm": 1.3905627727508545,
      "learning_rate": 1.3778739760445552e-05,
      "loss": 1.9447,
      "step": 296
    },
    {
      "epoch": 0.7626409017713366,
      "eval_loss": 1.9381885528564453,
      "eval_runtime": 123.5018,
      "eval_samples_per_second": 1.984,
      "eval_steps_per_second": 0.996,
      "step": 296
    },
    {
      "epoch": 0.7729468599033816,
      "grad_norm": 1.365517020225525,
      "learning_rate": 1.2658926150792322e-05,
      "loss": 2.0237,
      "step": 300
    },
    {
      "epoch": 0.7832528180354267,
      "grad_norm": 1.2530505657196045,
      "learning_rate": 1.157994445715706e-05,
      "loss": 1.965,
      "step": 304
    },
    {
      "epoch": 0.7832528180354267,
      "eval_loss": 1.9391155242919922,
      "eval_runtime": 238.0568,
      "eval_samples_per_second": 1.029,
      "eval_steps_per_second": 0.517,
      "step": 304
    },
    {
      "epoch": 0.7935587761674718,
      "grad_norm": 1.2440646886825562,
      "learning_rate": 1.0542974530180327e-05,
      "loss": 1.9169,
      "step": 308
    },
    {
      "epoch": 0.8038647342995169,
      "grad_norm": 1.2581100463867188,
      "learning_rate": 9.549150281252633e-06,
      "loss": 1.9656,
      "step": 312
    },
    {
      "epoch": 0.8038647342995169,
      "eval_loss": 1.935294508934021,
      "eval_runtime": 113.5421,
      "eval_samples_per_second": 2.158,
      "eval_steps_per_second": 1.083,
      "step": 312
    },
    {
      "epoch": 0.814170692431562,
      "grad_norm": 1.2223581075668335,
      "learning_rate": 8.599558442598998e-06,
      "loss": 1.9945,
      "step": 316
    },
    {
      "epoch": 0.8244766505636071,
      "grad_norm": 1.2913447618484497,
      "learning_rate": 7.695237378953223e-06,
      "loss": 1.9928,
      "step": 320
    },
    {
      "epoch": 0.8244766505636071,
      "eval_loss": 1.9398421049118042,
      "eval_runtime": 127.4313,
      "eval_samples_per_second": 1.923,
      "eval_steps_per_second": 0.965,
      "step": 320
    },
    {
      "epoch": 0.8347826086956521,
      "grad_norm": 1.181248664855957,
      "learning_rate": 6.837175952121306e-06,
      "loss": 2.0377,
      "step": 324
    },
    {
      "epoch": 0.8450885668276973,
      "grad_norm": 1.2499854564666748,
      "learning_rate": 6.026312439675552e-06,
      "loss": 2.0004,
      "step": 328
    },
    {
      "epoch": 0.8450885668276973,
      "eval_loss": 1.9392446279525757,
      "eval_runtime": 122.338,
      "eval_samples_per_second": 2.003,
      "eval_steps_per_second": 1.005,
      "step": 328
    },
    {
      "epoch": 0.8553945249597423,
      "grad_norm": 1.2896592617034912,
      "learning_rate": 5.263533508961827e-06,
      "loss": 1.9349,
      "step": 332
    },
    {
      "epoch": 0.8657004830917875,
      "grad_norm": 1.1416115760803223,
      "learning_rate": 4.549673247541875e-06,
      "loss": 1.9883,
      "step": 336
    },
    {
      "epoch": 0.8657004830917875,
      "eval_loss": 1.9389346837997437,
      "eval_runtime": 229.9634,
      "eval_samples_per_second": 1.065,
      "eval_steps_per_second": 0.535,
      "step": 336
    },
    {
      "epoch": 0.8760064412238325,
      "grad_norm": 1.350136399269104,
      "learning_rate": 3.885512251130763e-06,
      "loss": 1.9733,
      "step": 340
    },
    {
      "epoch": 0.8863123993558776,
      "grad_norm": 1.3249493837356567,
      "learning_rate": 3.271776770026963e-06,
      "loss": 1.9764,
      "step": 344
    },
    {
      "epoch": 0.8863123993558776,
      "eval_loss": 1.9394721984863281,
      "eval_runtime": 123.8835,
      "eval_samples_per_second": 1.978,
      "eval_steps_per_second": 0.993,
      "step": 344
    },
    {
      "epoch": 0.8966183574879227,
      "grad_norm": 1.1916362047195435,
      "learning_rate": 2.7091379149682685e-06,
      "loss": 1.918,
      "step": 348
    },
    {
      "epoch": 0.9069243156199678,
      "grad_norm": 1.3328787088394165,
      "learning_rate": 2.1982109232821178e-06,
      "loss": 1.9474,
      "step": 352
    },
    {
      "epoch": 0.9069243156199678,
      "eval_loss": 1.9390062093734741,
      "eval_runtime": 132.5632,
      "eval_samples_per_second": 1.848,
      "eval_steps_per_second": 0.928,
      "step": 352
    },
    {
      "epoch": 0.9172302737520129,
      "grad_norm": 1.3123880624771118,
      "learning_rate": 1.7395544861325718e-06,
      "loss": 2.0644,
      "step": 356
    },
    {
      "epoch": 0.927536231884058,
      "grad_norm": 1.2262526750564575,
      "learning_rate": 1.333670137599713e-06,
      "loss": 2.0375,
      "step": 360
    },
    {
      "epoch": 0.927536231884058,
      "eval_loss": 1.9382147789001465,
      "eval_runtime": 125.7134,
      "eval_samples_per_second": 1.949,
      "eval_steps_per_second": 0.978,
      "step": 360
    },
    {
      "epoch": 0.937842190016103,
      "grad_norm": 1.253958821296692,
      "learning_rate": 9.810017062595322e-07,
      "loss": 2.0065,
      "step": 364
    },
    {
      "epoch": 0.9481481481481482,
      "grad_norm": 1.2366716861724854,
      "learning_rate": 6.819348298638839e-07,
      "loss": 1.9424,
      "step": 368
    },
    {
      "epoch": 0.9481481481481482,
      "eval_loss": 1.9385864734649658,
      "eval_runtime": 48.1426,
      "eval_samples_per_second": 5.089,
      "eval_steps_per_second": 2.555,
      "step": 368
    },
    {
      "epoch": 0.9584541062801932,
      "grad_norm": 1.3085380792617798,
      "learning_rate": 4.367965336512403e-07,
      "loss": 1.9222,
      "step": 372
    },
    {
      "epoch": 0.9687600644122383,
      "grad_norm": 1.3303335905075073,
      "learning_rate": 2.458548727494292e-07,
      "loss": 2.0088,
      "step": 376
    },
    {
      "epoch": 0.9687600644122383,
      "eval_loss": 1.9384690523147583,
      "eval_runtime": 83.7915,
      "eval_samples_per_second": 2.924,
      "eval_steps_per_second": 1.468,
      "step": 376
    },
    {
      "epoch": 0.9790660225442834,
      "grad_norm": 1.3368890285491943,
      "learning_rate": 1.0931863906127327e-07,
      "loss": 1.8973,
      "step": 380
    },
    {
      "epoch": 0.9893719806763285,
      "grad_norm": 1.4014090299606323,
      "learning_rate": 2.7337132953697554e-08,
      "loss": 1.9043,
      "step": 384
    },
    {
      "epoch": 0.9893719806763285,
      "eval_loss": 1.9385173320770264,
      "eval_runtime": 46.0839,
      "eval_samples_per_second": 5.316,
      "eval_steps_per_second": 2.669,
      "step": 384
    },
    {
      "epoch": 0.9996779388083736,
      "grad_norm": 1.3734824657440186,
      "learning_rate": 0.0,
      "loss": 2.0065,
      "step": 388
    }
  ],
  "logging_steps": 4,
  "max_steps": 388,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 8,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.427475676604334e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}