|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 300,
  "global_step": 819,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003663003663003663,
      "grad_norm": 165.0,
      "learning_rate": 2e-06,
      "loss": 9.1083,
      "step": 1
    },
    {
      "epoch": 0.018315018315018316,
      "grad_norm": 64.0,
      "learning_rate": 1.9998820020169668e-06,
      "loss": 8.3175,
      "step": 5
    },
    {
      "epoch": 0.03663003663003663,
      "grad_norm": 26.875,
      "learning_rate": 1.999402682936637e-06,
      "loss": 7.3293,
      "step": 10
    },
    {
      "epoch": 0.054945054945054944,
      "grad_norm": 16.75,
      "learning_rate": 1.998554844493029e-06,
      "loss": 6.8602,
      "step": 15
    },
    {
      "epoch": 0.07326007326007326,
      "grad_norm": 11.0,
      "learning_rate": 1.997338799317767e-06,
      "loss": 6.6054,
      "step": 20
    },
    {
      "epoch": 0.09157509157509157,
      "grad_norm": 8.4375,
      "learning_rate": 1.995754995814884e-06,
      "loss": 6.3957,
      "step": 25
    },
    {
      "epoch": 0.10989010989010989,
      "grad_norm": 6.9375,
      "learning_rate": 1.9938040179954784e-06,
      "loss": 6.256,
      "step": 30
    },
    {
      "epoch": 0.1282051282051282,
      "grad_norm": 6.15625,
      "learning_rate": 1.991486585262365e-06,
      "loss": 6.1321,
      "step": 35
    },
    {
      "epoch": 0.14652014652014653,
      "grad_norm": 5.1875,
      "learning_rate": 1.988803552144804e-06,
      "loss": 6.018,
      "step": 40
    },
    {
      "epoch": 0.16483516483516483,
      "grad_norm": 5.59375,
      "learning_rate": 1.9857559079834022e-06,
      "loss": 5.9438,
      "step": 45
    },
    {
      "epoch": 0.18315018315018314,
      "grad_norm": 6.5,
      "learning_rate": 1.982344776565302e-06,
      "loss": 5.8733,
      "step": 50
    },
    {
      "epoch": 0.20146520146520147,
      "grad_norm": 4.84375,
      "learning_rate": 1.978571415709799e-06,
      "loss": 5.8341,
      "step": 55
    },
    {
      "epoch": 0.21978021978021978,
      "grad_norm": 4.125,
      "learning_rate": 1.9744372168045322e-06,
      "loss": 5.793,
      "step": 60
    },
    {
      "epoch": 0.23809523809523808,
      "grad_norm": 3.8125,
      "learning_rate": 1.9699437042924264e-06,
      "loss": 5.7305,
      "step": 65
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 4.0,
      "learning_rate": 1.965092535109567e-06,
      "loss": 5.671,
      "step": 70
    },
    {
      "epoch": 0.27472527472527475,
      "grad_norm": 4.125,
      "learning_rate": 1.959885498074224e-06,
      "loss": 5.6563,
      "step": 75
    },
    {
      "epoch": 0.29304029304029305,
      "grad_norm": 3.765625,
      "learning_rate": 1.954324513227244e-06,
      "loss": 5.5778,
      "step": 80
    },
    {
      "epoch": 0.31135531135531136,
      "grad_norm": 3.75,
      "learning_rate": 1.948411631124053e-06,
      "loss": 5.5838,
      "step": 85
    },
    {
      "epoch": 0.32967032967032966,
      "grad_norm": 6.71875,
      "learning_rate": 1.942149032078538e-06,
      "loss": 5.5305,
      "step": 90
    },
    {
      "epoch": 0.34798534798534797,
      "grad_norm": 4.6875,
      "learning_rate": 1.935539025359077e-06,
      "loss": 5.5108,
      "step": 95
    },
    {
      "epoch": 0.3663003663003663,
      "grad_norm": 3.546875,
      "learning_rate": 1.928584048337022e-06,
      "loss": 5.494,
      "step": 100
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 3.75,
      "learning_rate": 1.9212866655879395e-06,
      "loss": 5.4749,
      "step": 105
    },
    {
      "epoch": 0.40293040293040294,
      "grad_norm": 4.5,
      "learning_rate": 1.9136495679459563e-06,
      "loss": 5.4473,
      "step": 110
    },
    {
      "epoch": 0.42124542124542125,
      "grad_norm": 4.34375,
      "learning_rate": 1.9056755715115372e-06,
      "loss": 5.4071,
      "step": 115
    },
    {
      "epoch": 0.43956043956043955,
      "grad_norm": 3.609375,
      "learning_rate": 1.8973676166130791e-06,
      "loss": 5.3994,
      "step": 120
    },
    {
      "epoch": 0.45787545787545786,
      "grad_norm": 3.859375,
      "learning_rate": 1.8887287667226963e-06,
      "loss": 5.3773,
      "step": 125
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 3.4375,
      "learning_rate": 1.8797622073265943e-06,
      "loss": 5.3734,
      "step": 130
    },
    {
      "epoch": 0.4945054945054945,
      "grad_norm": 3.8125,
      "learning_rate": 1.8704712447504579e-06,
      "loss": 5.323,
      "step": 135
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 3.4375,
      "learning_rate": 1.8608593049402752e-06,
      "loss": 5.2964,
      "step": 140
    },
    {
      "epoch": 0.5311355311355311,
      "grad_norm": 3.484375,
      "learning_rate": 1.850929932199058e-06,
      "loss": 5.2951,
      "step": 145
    },
    {
      "epoch": 0.5494505494505495,
      "grad_norm": 3.375,
      "learning_rate": 1.8406867878799152e-06,
      "loss": 5.2498,
      "step": 150
    },
    {
      "epoch": 0.5677655677655677,
      "grad_norm": 3.859375,
      "learning_rate": 1.8301336490359678e-06,
      "loss": 5.2432,
      "step": 155
    },
    {
      "epoch": 0.5860805860805861,
      "grad_norm": 3.90625,
      "learning_rate": 1.819274407027599e-06,
      "loss": 5.2436,
      "step": 160
    },
    {
      "epoch": 0.6043956043956044,
      "grad_norm": 4.6875,
      "learning_rate": 1.8081130660875555e-06,
      "loss": 5.2218,
      "step": 165
    },
    {
      "epoch": 0.6227106227106227,
      "grad_norm": 3.5625,
      "learning_rate": 1.79665374184443e-06,
      "loss": 5.2194,
      "step": 170
    },
    {
      "epoch": 0.6410256410256411,
      "grad_norm": 3.546875,
      "learning_rate": 1.7849006598050625e-06,
      "loss": 5.1988,
      "step": 175
    },
    {
      "epoch": 0.6593406593406593,
      "grad_norm": 3.9375,
      "learning_rate": 1.7728581537964318e-06,
      "loss": 5.1814,
      "step": 180
    },
    {
      "epoch": 0.6776556776556777,
      "grad_norm": 4.40625,
      "learning_rate": 1.7605306643676006e-06,
      "loss": 5.1318,
      "step": 185
    },
    {
      "epoch": 0.6959706959706959,
      "grad_norm": 3.4375,
      "learning_rate": 1.747922737152308e-06,
      "loss": 5.1609,
      "step": 190
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 4.125,
      "learning_rate": 1.7350390211928166e-06,
      "loss": 5.1395,
      "step": 195
    },
    {
      "epoch": 0.7326007326007326,
      "grad_norm": 3.375,
      "learning_rate": 1.721884267225624e-06,
      "loss": 5.0794,
      "step": 200
    },
    {
      "epoch": 0.7509157509157509,
      "grad_norm": 3.421875,
      "learning_rate": 1.7084633259296795e-06,
      "loss": 5.0974,
      "step": 205
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 3.46875,
      "learning_rate": 1.6947811461377467e-06,
      "loss": 5.1007,
      "step": 210
    },
    {
      "epoch": 0.7875457875457875,
      "grad_norm": 3.984375,
      "learning_rate": 1.6808427730115712e-06,
      "loss": 5.0777,
      "step": 215
    },
    {
      "epoch": 0.8058608058608059,
      "grad_norm": 3.828125,
      "learning_rate": 1.6666533461815323e-06,
      "loss": 5.0517,
      "step": 220
    },
    {
      "epoch": 0.8241758241758241,
      "grad_norm": 3.546875,
      "learning_rate": 1.6522180978514552e-06,
      "loss": 5.0462,
      "step": 225
    },
    {
      "epoch": 0.8424908424908425,
      "grad_norm": 3.78125,
      "learning_rate": 1.6375423508692912e-06,
      "loss": 5.0538,
      "step": 230
    },
    {
      "epoch": 0.8608058608058609,
      "grad_norm": 3.515625,
      "learning_rate": 1.622631516764372e-06,
      "loss": 5.0335,
      "step": 235
    },
    {
      "epoch": 0.8791208791208791,
      "grad_norm": 3.359375,
      "learning_rate": 1.607491093751966e-06,
      "loss": 5.0177,
      "step": 240
    },
    {
      "epoch": 0.8974358974358975,
      "grad_norm": 4.8125,
      "learning_rate": 1.592126664705868e-06,
      "loss": 4.998,
      "step": 245
    },
    {
      "epoch": 0.9157509157509157,
      "grad_norm": 3.671875,
      "learning_rate": 1.5765438950997703e-06,
      "loss": 4.992,
      "step": 250
    },
    {
      "epoch": 0.9340659340659341,
      "grad_norm": 3.828125,
      "learning_rate": 1.5607485309181812e-06,
      "loss": 4.9831,
      "step": 255
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 3.90625,
      "learning_rate": 1.544746396537651e-06,
      "loss": 4.9835,
      "step": 260
    },
    {
      "epoch": 0.9706959706959707,
      "grad_norm": 3.484375,
      "learning_rate": 1.5285433925790945e-06,
      "loss": 4.962,
      "step": 265
    },
    {
      "epoch": 0.989010989010989,
      "grad_norm": 3.421875,
      "learning_rate": 1.5121454937319975e-06,
      "loss": 4.9708,
      "step": 270
    },
    {
      "epoch": 1.0073260073260073,
      "grad_norm": 3.9375,
      "learning_rate": 1.4955587465513128e-06,
      "loss": 4.9494,
      "step": 275
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 3.703125,
      "learning_rate": 1.4787892672278555e-06,
      "loss": 4.9172,
      "step": 280
    },
    {
      "epoch": 1.043956043956044,
      "grad_norm": 3.515625,
      "learning_rate": 1.461843239333021e-06,
      "loss": 4.9262,
      "step": 285
    },
    {
      "epoch": 1.0622710622710623,
      "grad_norm": 3.75,
      "learning_rate": 1.444726911538657e-06,
      "loss": 4.9115,
      "step": 290
    },
    {
      "epoch": 1.0805860805860805,
      "grad_norm": 3.671875,
      "learning_rate": 1.4274465953129325e-06,
      "loss": 4.8926,
      "step": 295
    },
    {
      "epoch": 1.098901098901099,
      "grad_norm": 3.625,
      "learning_rate": 1.4100086625930462e-06,
      "loss": 4.8845,
      "step": 300
    },
    {
      "epoch": 1.098901098901099,
      "eval_loss": 4.895308017730713,
      "eval_runtime": 14.5245,
      "eval_samples_per_second": 36.421,
      "eval_steps_per_second": 1.17,
      "step": 300
    },
    {
      "epoch": 1.1172161172161172,
      "grad_norm": 3.5,
      "learning_rate": 1.3924195434356441e-06,
      "loss": 4.8754,
      "step": 305
    },
    {
      "epoch": 1.1355311355311355,
      "grad_norm": 3.484375,
      "learning_rate": 1.3746857236458005e-06,
      "loss": 4.8566,
      "step": 310
    },
    {
      "epoch": 1.1538461538461537,
      "grad_norm": 3.875,
      "learning_rate": 1.3568137423854457e-06,
      "loss": 4.873,
      "step": 315
    },
    {
      "epoch": 1.1721611721611722,
      "grad_norm": 3.515625,
      "learning_rate": 1.3388101897621183e-06,
      "loss": 4.8476,
      "step": 320
    },
    {
      "epoch": 1.1904761904761905,
      "grad_norm": 4.0,
      "learning_rate": 1.32068170439893e-06,
      "loss": 4.8846,
      "step": 325
    },
    {
      "epoch": 1.2087912087912087,
      "grad_norm": 3.453125,
      "learning_rate": 1.3024349709866448e-06,
      "loss": 4.864,
      "step": 330
    },
    {
      "epoch": 1.2271062271062272,
      "grad_norm": 5.125,
      "learning_rate": 1.2840767178187654e-06,
      "loss": 4.8457,
      "step": 335
    },
    {
      "epoch": 1.2454212454212454,
      "grad_norm": 3.828125,
      "learning_rate": 1.265613714310548e-06,
      "loss": 4.8469,
      "step": 340
    },
    {
      "epoch": 1.2637362637362637,
      "grad_norm": 3.59375,
      "learning_rate": 1.2470527685028482e-06,
      "loss": 4.8406,
      "step": 345
    },
    {
      "epoch": 1.282051282051282,
      "grad_norm": 3.609375,
      "learning_rate": 1.228400724551728e-06,
      "loss": 4.8545,
      "step": 350
    },
    {
      "epoch": 1.3003663003663004,
      "grad_norm": 3.578125,
      "learning_rate": 1.2096644602047445e-06,
      "loss": 4.8236,
      "step": 355
    },
    {
      "epoch": 1.3186813186813187,
      "grad_norm": 3.515625,
      "learning_rate": 1.1908508842648505e-06,
      "loss": 4.8126,
      "step": 360
    },
    {
      "epoch": 1.3369963369963371,
      "grad_norm": 3.453125,
      "learning_rate": 1.171966934042847e-06,
      "loss": 4.8038,
      "step": 365
    },
    {
      "epoch": 1.3553113553113554,
      "grad_norm": 3.484375,
      "learning_rate": 1.1530195727993199e-06,
      "loss": 4.8278,
      "step": 370
    },
    {
      "epoch": 1.3736263736263736,
      "grad_norm": 3.53125,
      "learning_rate": 1.1340157871770115e-06,
      "loss": 4.8217,
      "step": 375
    },
    {
      "epoch": 1.3919413919413919,
      "grad_norm": 3.6875,
      "learning_rate": 1.1149625846245681e-06,
      "loss": 4.7912,
      "step": 380
    },
    {
      "epoch": 1.4102564102564101,
      "grad_norm": 3.515625,
      "learning_rate": 1.095866990812615e-06,
      "loss": 4.8004,
      "step": 385
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 3.5625,
      "learning_rate": 1.0767360470431157e-06,
      "loss": 4.7896,
      "step": 390
    },
    {
      "epoch": 1.4468864468864469,
      "grad_norm": 3.625,
      "learning_rate": 1.0575768076529625e-06,
      "loss": 4.7788,
      "step": 395
    },
    {
      "epoch": 1.4652014652014653,
      "grad_norm": 3.640625,
      "learning_rate": 1.0383963374127645e-06,
      "loss": 4.8106,
      "step": 400
    },
    {
      "epoch": 1.4835164835164836,
      "grad_norm": 3.71875,
      "learning_rate": 1.0192017089217861e-06,
      "loss": 4.7749,
      "step": 405
    },
    {
      "epoch": 1.5018315018315018,
      "grad_norm": 3.484375,
      "learning_rate": 1e-06,
      "loss": 4.7864,
      "step": 410
    },
    {
      "epoch": 1.52014652014652,
      "grad_norm": 3.59375,
      "learning_rate": 9.80798291078214e-07,
      "loss": 4.7869,
      "step": 415
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 3.625,
      "learning_rate": 9.616036625872356e-07,
      "loss": 4.8314,
      "step": 420
    },
    {
      "epoch": 1.5567765567765568,
      "grad_norm": 3.734375,
      "learning_rate": 9.424231923470376e-07,
      "loss": 4.7831,
      "step": 425
    },
    {
      "epoch": 1.575091575091575,
      "grad_norm": 3.578125,
      "learning_rate": 9.232639529568842e-07,
      "loss": 4.7636,
      "step": 430
    },
    {
      "epoch": 1.5934065934065935,
      "grad_norm": 3.71875,
      "learning_rate": 9.041330091873851e-07,
      "loss": 4.7465,
      "step": 435
    },
    {
      "epoch": 1.6117216117216118,
      "grad_norm": 3.546875,
      "learning_rate": 8.850374153754321e-07,
      "loss": 4.7616,
      "step": 440
    },
    {
      "epoch": 1.63003663003663,
      "grad_norm": 3.46875,
      "learning_rate": 8.659842128229886e-07,
      "loss": 4.7542,
      "step": 445
    },
    {
      "epoch": 1.6483516483516483,
      "grad_norm": 3.5,
      "learning_rate": 8.4698042720068e-07,
      "loss": 4.745,
      "step": 450
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 3.921875,
      "learning_rate": 8.280330659571531e-07,
      "loss": 4.738,
      "step": 455
    },
    {
      "epoch": 1.684981684981685,
      "grad_norm": 3.75,
      "learning_rate": 8.091491157351493e-07,
      "loss": 4.7335,
      "step": 460
    },
    {
      "epoch": 1.7032967032967035,
      "grad_norm": 3.546875,
      "learning_rate": 7.903355397952556e-07,
      "loss": 4.7792,
      "step": 465
    },
    {
      "epoch": 1.7216117216117217,
      "grad_norm": 3.546875,
      "learning_rate": 7.715992754482718e-07,
      "loss": 4.7288,
      "step": 470
    },
    {
      "epoch": 1.73992673992674,
      "grad_norm": 3.515625,
      "learning_rate": 7.529472314971522e-07,
      "loss": 4.7489,
      "step": 475
    },
    {
      "epoch": 1.7582417582417582,
      "grad_norm": 3.484375,
      "learning_rate": 7.34386285689452e-07,
      "loss": 4.7163,
      "step": 480
    },
    {
      "epoch": 1.7765567765567765,
      "grad_norm": 3.828125,
      "learning_rate": 7.159232821812347e-07,
      "loss": 4.7201,
      "step": 485
    },
    {
      "epoch": 1.7948717948717947,
      "grad_norm": 3.5625,
      "learning_rate": 6.975650290133554e-07,
      "loss": 4.7334,
      "step": 490
    },
    {
      "epoch": 1.8131868131868132,
      "grad_norm": 3.609375,
      "learning_rate": 6.793182956010699e-07,
      "loss": 4.7339,
      "step": 495
    },
    {
      "epoch": 1.8315018315018317,
      "grad_norm": 3.484375,
      "learning_rate": 6.611898102378818e-07,
      "loss": 4.7384,
      "step": 500
    },
    {
      "epoch": 1.84981684981685,
      "grad_norm": 3.546875,
      "learning_rate": 6.431862576145544e-07,
      "loss": 4.7402,
      "step": 505
    },
    {
      "epoch": 1.8681318681318682,
      "grad_norm": 3.703125,
      "learning_rate": 6.253142763541995e-07,
      "loss": 4.723,
      "step": 510
    },
    {
      "epoch": 1.8864468864468864,
      "grad_norm": 3.5,
      "learning_rate": 6.075804565643561e-07,
      "loss": 4.7457,
      "step": 515
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 3.5625,
      "learning_rate": 5.899913374069538e-07,
      "loss": 4.712,
      "step": 520
    },
    {
      "epoch": 1.9230769230769231,
      "grad_norm": 3.703125,
      "learning_rate": 5.725534046870677e-07,
      "loss": 4.7273,
      "step": 525
    },
    {
      "epoch": 1.9413919413919414,
      "grad_norm": 3.5625,
      "learning_rate": 5.552730884613428e-07,
      "loss": 4.6981,
      "step": 530
    },
    {
      "epoch": 1.9597069597069599,
      "grad_norm": 3.609375,
      "learning_rate": 5.381567606669793e-07,
      "loss": 4.7482,
      "step": 535
    },
    {
      "epoch": 1.978021978021978,
      "grad_norm": 3.765625,
      "learning_rate": 5.212107327721445e-07,
      "loss": 4.718,
      "step": 540
    },
    {
      "epoch": 1.9963369963369964,
      "grad_norm": 3.6875,
      "learning_rate": 5.044412534486873e-07,
      "loss": 4.7148,
      "step": 545
    },
    {
      "epoch": 2.0146520146520146,
      "grad_norm": 3.71875,
      "learning_rate": 4.878545062680026e-07,
      "loss": 4.6767,
      "step": 550
    },
    {
      "epoch": 2.032967032967033,
      "grad_norm": 3.453125,
      "learning_rate": 4.7145660742090575e-07,
      "loss": 4.706,
      "step": 555
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 4.0,
      "learning_rate": 4.5525360346234907e-07,
      "loss": 4.7142,
      "step": 560
    },
    {
      "epoch": 2.06959706959707,
      "grad_norm": 3.546875,
      "learning_rate": 4.392514690818193e-07,
      "loss": 4.7053,
      "step": 565
    },
    {
      "epoch": 2.087912087912088,
      "grad_norm": 3.765625,
      "learning_rate": 4.2345610490022996e-07,
      "loss": 4.6978,
      "step": 570
    },
    {
      "epoch": 2.1062271062271063,
      "grad_norm": 3.53125,
      "learning_rate": 4.078733352941321e-07,
      "loss": 4.7147,
      "step": 575
    },
    {
      "epoch": 2.1245421245421245,
      "grad_norm": 3.546875,
      "learning_rate": 3.925089062480339e-07,
      "loss": 4.6861,
      "step": 580
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 3.703125,
      "learning_rate": 3.77368483235628e-07,
      "loss": 4.725,
      "step": 585
    },
    {
      "epoch": 2.161172161172161,
      "grad_norm": 3.453125,
      "learning_rate": 3.6245764913070875e-07,
      "loss": 4.7223,
      "step": 590
    },
    {
      "epoch": 2.1794871794871793,
      "grad_norm": 3.578125,
      "learning_rate": 3.477819021485447e-07,
      "loss": 4.7254,
      "step": 595
    },
    {
      "epoch": 2.197802197802198,
      "grad_norm": 3.578125,
      "learning_rate": 3.333466538184674e-07,
      "loss": 4.7074,
      "step": 600
    },
    {
      "epoch": 2.197802197802198,
      "eval_loss": 4.718916893005371,
      "eval_runtime": 14.5284,
      "eval_samples_per_second": 36.412,
      "eval_steps_per_second": 1.17,
      "step": 600
    },
    {
      "epoch": 2.2161172161172162,
      "grad_norm": 3.546875,
      "learning_rate": 3.1915722698842874e-07,
      "loss": 4.705,
      "step": 605
    },
    {
      "epoch": 2.2344322344322345,
      "grad_norm": 3.734375,
      "learning_rate": 3.0521885386225344e-07,
      "loss": 4.7118,
      "step": 610
    },
    {
      "epoch": 2.2527472527472527,
      "grad_norm": 3.703125,
      "learning_rate": 2.9153667407032066e-07,
      "loss": 4.7285,
      "step": 615
    },
    {
      "epoch": 2.271062271062271,
      "grad_norm": 3.59375,
      "learning_rate": 2.7811573277437603e-07,
      "loss": 4.681,
      "step": 620
    },
    {
      "epoch": 2.2893772893772892,
      "grad_norm": 3.546875,
      "learning_rate": 2.649609788071836e-07,
      "loss": 4.707,
      "step": 625
    },
    {
      "epoch": 2.3076923076923075,
      "grad_norm": 3.46875,
      "learning_rate": 2.520772628476919e-07,
      "loss": 4.7278,
      "step": 630
    },
    {
      "epoch": 2.326007326007326,
      "grad_norm": 3.59375,
      "learning_rate": 2.394693356323997e-07,
      "loss": 4.7202,
      "step": 635
    },
    {
      "epoch": 2.3443223443223444,
      "grad_norm": 3.609375,
      "learning_rate": 2.2714184620356826e-07,
      "loss": 4.706,
      "step": 640
    },
    {
      "epoch": 2.3626373626373627,
      "grad_norm": 3.5,
      "learning_rate": 2.150993401949376e-07,
      "loss": 4.732,
      "step": 645
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 4.46875,
      "learning_rate": 2.0334625815557026e-07,
      "loss": 4.6981,
      "step": 650
    },
    {
      "epoch": 2.399267399267399,
      "grad_norm": 3.5625,
      "learning_rate": 1.9188693391244438e-07,
      "loss": 4.7334,
      "step": 655
    },
    {
      "epoch": 2.4175824175824174,
      "grad_norm": 3.609375,
      "learning_rate": 1.8072559297240097e-07,
      "loss": 4.7284,
      "step": 660
    },
    {
      "epoch": 2.435897435897436,
      "grad_norm": 3.515625,
      "learning_rate": 1.6986635096403212e-07,
      "loss": 4.7189,
      "step": 665
    },
    {
      "epoch": 2.4542124542124544,
      "grad_norm": 3.484375,
      "learning_rate": 1.5931321212008465e-07,
      "loss": 4.7007,
      "step": 670
    },
    {
      "epoch": 2.4725274725274726,
      "grad_norm": 3.53125,
      "learning_rate": 1.490700678009421e-07,
      "loss": 4.6923,
      "step": 675
    },
    {
      "epoch": 2.490842490842491,
      "grad_norm": 3.546875,
      "learning_rate": 1.3914069505972482e-07,
      "loss": 4.7265,
      "step": 680
    },
    {
      "epoch": 2.509157509157509,
      "grad_norm": 3.6875,
      "learning_rate": 1.2952875524954232e-07,
      "loss": 4.7318,
      "step": 685
    },
    {
      "epoch": 2.5274725274725274,
      "grad_norm": 3.546875,
      "learning_rate": 1.2023779267340563e-07,
      "loss": 4.7144,
      "step": 690
    },
    {
      "epoch": 2.5457875457875456,
      "grad_norm": 3.609375,
      "learning_rate": 1.112712332773038e-07,
      "loss": 4.6979,
      "step": 695
    },
    {
      "epoch": 2.564102564102564,
      "grad_norm": 3.5625,
      "learning_rate": 1.026323833869206e-07,
      "loss": 4.6933,
      "step": 700
    },
    {
      "epoch": 2.5824175824175826,
      "grad_norm": 3.59375,
      "learning_rate": 9.432442848846289e-08,
      "loss": 4.731,
      "step": 705
    },
    {
      "epoch": 2.600732600732601,
      "grad_norm": 3.53125,
      "learning_rate": 8.63504320540438e-08,
      "loss": 4.7246,
      "step": 710
    },
    {
      "epoch": 2.619047619047619,
      "grad_norm": 3.53125,
      "learning_rate": 7.871333441206052e-08,
      "loss": 4.7311,
      "step": 715
    },
    {
      "epoch": 2.6373626373626373,
      "grad_norm": 3.53125,
      "learning_rate": 7.141595166297832e-08,
      "loss": 4.7213,
      "step": 720
    },
    {
      "epoch": 2.6556776556776556,
      "grad_norm": 3.484375,
      "learning_rate": 6.446097464092248e-08,
      "loss": 4.6983,
      "step": 725
    },
    {
      "epoch": 2.6739926739926743,
      "grad_norm": 3.546875,
      "learning_rate": 5.78509679214616e-08,
      "loss": 4.6839,
      "step": 730
    },
    {
      "epoch": 2.6923076923076925,
      "grad_norm": 3.5625,
      "learning_rate": 5.1588368875946864e-08,
      "loss": 4.7098,
      "step": 735
    },
    {
      "epoch": 2.7106227106227108,
      "grad_norm": 3.65625,
      "learning_rate": 4.567548677275601e-08,
      "loss": 4.6954,
      "step": 740
    },
    {
      "epoch": 2.728937728937729,
      "grad_norm": 3.578125,
      "learning_rate": 4.0114501925775925e-08,
      "loss": 4.7284,
      "step": 745
    },
    {
      "epoch": 2.7472527472527473,
      "grad_norm": 3.765625,
      "learning_rate": 3.490746489043317e-08,
      "loss": 4.7099,
      "step": 750
    },
    {
      "epoch": 2.7655677655677655,
      "grad_norm": 3.5625,
      "learning_rate": 3.005629570757373e-08,
      "loss": 4.7144,
      "step": 755
    },
    {
      "epoch": 2.7838827838827838,
      "grad_norm": 3.625,
      "learning_rate": 2.5562783195467675e-08,
      "loss": 4.6949,
      "step": 760
    },
    {
      "epoch": 2.802197802197802,
      "grad_norm": 3.515625,
      "learning_rate": 2.1428584290201114e-08,
      "loss": 4.7045,
      "step": 765
    },
    {
      "epoch": 2.8205128205128203,
      "grad_norm": 3.703125,
      "learning_rate": 1.7655223434698053e-08,
      "loss": 4.688,
      "step": 770
    },
    {
      "epoch": 2.838827838827839,
      "grad_norm": 3.578125,
      "learning_rate": 1.4244092016597931e-08,
      "loss": 4.7177,
      "step": 775
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 3.59375,
      "learning_rate": 1.11964478551958e-08,
      "loss": 4.7073,
      "step": 780
    },
    {
      "epoch": 2.8754578754578755,
      "grad_norm": 3.453125,
      "learning_rate": 8.513414737635005e-09,
      "loss": 4.7176,
      "step": 785
    },
    {
      "epoch": 2.8937728937728937,
      "grad_norm": 3.515625,
      "learning_rate": 6.1959820045215385e-09,
      "loss": 4.6779,
      "step": 790
    },
    {
      "epoch": 2.912087912087912,
      "grad_norm": 3.484375,
      "learning_rate": 4.245004185115752e-09,
      "loss": 4.7052,
      "step": 795
    },
    {
      "epoch": 2.9304029304029307,
      "grad_norm": 3.484375,
      "learning_rate": 2.661200682232745e-09,
      "loss": 4.7179,
      "step": 800
    },
    {
      "epoch": 2.948717948717949,
      "grad_norm": 3.640625,
      "learning_rate": 1.4451555069708853e-09,
      "loss": 4.6552,
      "step": 805
    },
    {
      "epoch": 2.967032967032967,
      "grad_norm": 4.21875,
      "learning_rate": 5.973170633631897e-10,
      "loss": 4.6979,
      "step": 810
    },
    {
      "epoch": 2.9853479853479854,
      "grad_norm": 3.578125,
      "learning_rate": 1.1799798303335772e-10,
      "loss": 4.7113,
      "step": 815
    },
    {
      "epoch": 3.0,
      "step": 819,
      "total_flos": 5.361516984661967e+18,
      "train_loss": 5.0057218712328115,
      "train_runtime": 5088.8536,
      "train_samples_per_second": 10.267,
      "train_steps_per_second": 0.161
    }
  ],
  "logging_steps": 5,
  "max_steps": 819,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.361516984661967e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|