{
"best_metric": 2.031247138977051,
"best_model_checkpoint": "/home/datta0/models/lora_final/Mistral-7B-v0.3_pct_reverse_r32/checkpoint-8",
"epoch": 0.9903687285915777,
"eval_steps": 8,
"global_step": 384,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025790852307072333,
"grad_norm": 23.795961380004883,
"learning_rate": 1.25e-05,
"loss": 2.2237,
"step": 1
},
{
"epoch": 0.010316340922828933,
"grad_norm": 10.762740135192871,
"learning_rate": 5e-05,
"loss": 2.0557,
"step": 4
},
{
"epoch": 0.020632681845657867,
"grad_norm": 12.864238739013672,
"learning_rate": 0.0001,
"loss": 1.991,
"step": 8
},
{
"epoch": 0.020632681845657867,
"eval_loss": 2.031247138977051,
"eval_runtime": 50.8376,
"eval_samples_per_second": 4.839,
"eval_steps_per_second": 4.839,
"step": 8
},
{
"epoch": 0.0309490227684868,
"grad_norm": 20.236906051635742,
"learning_rate": 9.997251843068762e-05,
"loss": 2.0347,
"step": 12
},
{
"epoch": 0.04126536369131573,
"grad_norm": 11.911772727966309,
"learning_rate": 9.989010393221656e-05,
"loss": 2.0461,
"step": 16
},
{
"epoch": 0.04126536369131573,
"eval_loss": 2.0334763526916504,
"eval_runtime": 69.6741,
"eval_samples_per_second": 3.531,
"eval_steps_per_second": 3.531,
"step": 16
},
{
"epoch": 0.05158170461414467,
"grad_norm": 13.50040340423584,
"learning_rate": 9.97528470997769e-05,
"loss": 1.995,
"step": 20
},
{
"epoch": 0.0618980455369736,
"grad_norm": 12.473864555358887,
"learning_rate": 9.956089881469482e-05,
"loss": 2.0456,
"step": 24
},
{
"epoch": 0.0618980455369736,
"eval_loss": 2.060091018676758,
"eval_runtime": 49.4489,
"eval_samples_per_second": 4.975,
"eval_steps_per_second": 4.975,
"step": 24
},
{
"epoch": 0.07221438645980253,
"grad_norm": 16.583091735839844,
"learning_rate": 9.931447007857432e-05,
"loss": 2.0845,
"step": 28
},
{
"epoch": 0.08253072738263147,
"grad_norm": 38.525352478027344,
"learning_rate": 9.901383178135113e-05,
"loss": 2.0584,
"step": 32
},
{
"epoch": 0.08253072738263147,
"eval_loss": 2.087923765182495,
"eval_runtime": 68.1674,
"eval_samples_per_second": 3.609,
"eval_steps_per_second": 3.609,
"step": 32
},
{
"epoch": 0.09284706830546041,
"grad_norm": 18.212785720825195,
"learning_rate": 9.865931440351337e-05,
"loss": 2.0671,
"step": 36
},
{
"epoch": 0.10316340922828934,
"grad_norm": 18.464550018310547,
"learning_rate": 9.825130765281668e-05,
"loss": 2.1123,
"step": 40
},
{
"epoch": 0.10316340922828934,
"eval_loss": 2.080946922302246,
"eval_runtime": 51.3512,
"eval_samples_per_second": 4.791,
"eval_steps_per_second": 4.791,
"step": 40
},
{
"epoch": 0.11347975015111827,
"grad_norm": 18.683780670166016,
"learning_rate": 9.779026003589304e-05,
"loss": 2.0638,
"step": 44
},
{
"epoch": 0.1237960910739472,
"grad_norm": 20.039987564086914,
"learning_rate": 9.727667836522407e-05,
"loss": 2.0666,
"step": 48
},
{
"epoch": 0.1237960910739472,
"eval_loss": 2.0889713764190674,
"eval_runtime": 49.6263,
"eval_samples_per_second": 4.957,
"eval_steps_per_second": 4.957,
"step": 48
},
{
"epoch": 0.13411243199677614,
"grad_norm": 23.5998477935791,
"learning_rate": 9.6711127202021e-05,
"loss": 2.1188,
"step": 52
},
{
"epoch": 0.14442877291960507,
"grad_norm": 16.58580780029297,
"learning_rate": 9.609422823562345e-05,
"loss": 2.0733,
"step": 56
},
{
"epoch": 0.14442877291960507,
"eval_loss": 2.0954222679138184,
"eval_runtime": 68.3777,
"eval_samples_per_second": 3.598,
"eval_steps_per_second": 3.598,
"step": 56
},
{
"epoch": 0.154745113842434,
"grad_norm": 17.798566818237305,
"learning_rate": 9.542665960009959e-05,
"loss": 2.0855,
"step": 60
},
{
"epoch": 0.16506145476526293,
"grad_norm": 16.25016212463379,
"learning_rate": 9.470915512879852e-05,
"loss": 2.1236,
"step": 64
},
{
"epoch": 0.16506145476526293,
"eval_loss": 2.0970985889434814,
"eval_runtime": 50.5579,
"eval_samples_per_second": 4.866,
"eval_steps_per_second": 4.866,
"step": 64
},
{
"epoch": 0.1753777956880919,
"grad_norm": 14.97128677368164,
"learning_rate": 9.394250354767467e-05,
"loss": 2.104,
"step": 68
},
{
"epoch": 0.18569413661092082,
"grad_norm": 18.22063636779785,
"learning_rate": 9.312754760827061e-05,
"loss": 2.1103,
"step": 72
},
{
"epoch": 0.18569413661092082,
"eval_loss": 2.1007986068725586,
"eval_runtime": 68.15,
"eval_samples_per_second": 3.61,
"eval_steps_per_second": 3.61,
"step": 72
},
{
"epoch": 0.19601047753374976,
"grad_norm": 15.88463306427002,
"learning_rate": 9.226518316131176e-05,
"loss": 2.1123,
"step": 76
},
{
"epoch": 0.2063268184565787,
"grad_norm": 16.19556999206543,
"learning_rate": 9.1356358171931e-05,
"loss": 2.0876,
"step": 80
},
{
"epoch": 0.2063268184565787,
"eval_loss": 2.104221820831299,
"eval_runtime": 51.0209,
"eval_samples_per_second": 4.822,
"eval_steps_per_second": 4.822,
"step": 80
},
{
"epoch": 0.21664315937940762,
"grad_norm": 21.194753646850586,
"learning_rate": 9.040207167760586e-05,
"loss": 2.0851,
"step": 84
},
{
"epoch": 0.22695950030223655,
"grad_norm": 16.5502986907959,
"learning_rate": 8.940337268995385e-05,
"loss": 2.1107,
"step": 88
},
{
"epoch": 0.22695950030223655,
"eval_loss": 2.115513324737549,
"eval_runtime": 71.0739,
"eval_samples_per_second": 3.461,
"eval_steps_per_second": 3.461,
"step": 88
},
{
"epoch": 0.23727584122506548,
"grad_norm": 16.366037368774414,
"learning_rate": 8.836135904159302e-05,
"loss": 2.1165,
"step": 92
},
{
"epoch": 0.2475921821478944,
"grad_norm": 19.07808494567871,
"learning_rate": 8.727717617933544e-05,
"loss": 2.0889,
"step": 96
},
{
"epoch": 0.2475921821478944,
"eval_loss": 2.1082587242126465,
"eval_runtime": 51.4649,
"eval_samples_per_second": 4.78,
"eval_steps_per_second": 4.78,
"step": 96
},
{
"epoch": 0.25790852307072337,
"grad_norm": 16.868947982788086,
"learning_rate": 8.615201590504017e-05,
"loss": 2.1333,
"step": 100
},
{
"epoch": 0.2682248639935523,
"grad_norm": 18.22587013244629,
"learning_rate": 8.498711506550983e-05,
"loss": 2.097,
"step": 104
},
{
"epoch": 0.2682248639935523,
"eval_loss": 2.1185896396636963,
"eval_runtime": 50.4577,
"eval_samples_per_second": 4.875,
"eval_steps_per_second": 4.875,
"step": 104
},
{
"epoch": 0.27854120491638124,
"grad_norm": 17.453964233398438,
"learning_rate": 8.378375419287099e-05,
"loss": 2.1481,
"step": 108
},
{
"epoch": 0.28885754583921014,
"grad_norm": 15.08243465423584,
"learning_rate": 8.25432560969328e-05,
"loss": 2.0962,
"step": 112
},
{
"epoch": 0.28885754583921014,
"eval_loss": 2.120164394378662,
"eval_runtime": 68.8529,
"eval_samples_per_second": 3.573,
"eval_steps_per_second": 3.573,
"step": 112
},
{
"epoch": 0.2991738867620391,
"grad_norm": 15.695137977600098,
"learning_rate": 8.126698441107146e-05,
"loss": 2.1125,
"step": 116
},
{
"epoch": 0.309490227684868,
"grad_norm": 28.060436248779297,
"learning_rate": 7.995634209323886e-05,
"loss": 2.1415,
"step": 120
},
{
"epoch": 0.309490227684868,
"eval_loss": 2.1305339336395264,
"eval_runtime": 50.6817,
"eval_samples_per_second": 4.854,
"eval_steps_per_second": 4.854,
"step": 120
},
{
"epoch": 0.31980656860769696,
"grad_norm": 17.550668716430664,
"learning_rate": 7.861276988374302e-05,
"loss": 2.141,
"step": 124
},
{
"epoch": 0.33012290953052587,
"grad_norm": 20.923986434936523,
"learning_rate": 7.723774472149601e-05,
"loss": 2.1294,
"step": 128
},
{
"epoch": 0.33012290953052587,
"eval_loss": 2.1169350147247314,
"eval_runtime": 68.8987,
"eval_samples_per_second": 3.57,
"eval_steps_per_second": 3.57,
"step": 128
},
{
"epoch": 0.3404392504533548,
"grad_norm": 18.64177131652832,
"learning_rate": 7.583277812046993e-05,
"loss": 2.1473,
"step": 132
},
{
"epoch": 0.3507555913761838,
"grad_norm": 20.397233963012695,
"learning_rate": 7.439941450814591e-05,
"loss": 2.1476,
"step": 136
},
{
"epoch": 0.3507555913761838,
"eval_loss": 2.13000226020813,
"eval_runtime": 52.251,
"eval_samples_per_second": 4.708,
"eval_steps_per_second": 4.708,
"step": 136
},
{
"epoch": 0.3610719322990127,
"grad_norm": 14.290401458740234,
"learning_rate": 7.293922952778239e-05,
"loss": 2.1519,
"step": 140
},
{
"epoch": 0.37138827322184165,
"grad_norm": 19.956518173217773,
"learning_rate": 7.145382830636924e-05,
"loss": 2.1725,
"step": 144
},
{
"epoch": 0.37138827322184165,
"eval_loss": 2.124537467956543,
"eval_runtime": 66.2352,
"eval_samples_per_second": 3.714,
"eval_steps_per_second": 3.714,
"step": 144
},
{
"epoch": 0.38170461414467055,
"grad_norm": 18.11756134033203,
"learning_rate": 6.994484369017143e-05,
"loss": 2.131,
"step": 148
},
{
"epoch": 0.3920209550674995,
"grad_norm": 15.948026657104492,
"learning_rate": 6.841393444980177e-05,
"loss": 2.1159,
"step": 152
},
{
"epoch": 0.3920209550674995,
"eval_loss": 2.117161273956299,
"eval_runtime": 65.9204,
"eval_samples_per_second": 3.732,
"eval_steps_per_second": 3.732,
"step": 152
},
{
"epoch": 0.4023372959903284,
"grad_norm": 17.677406311035156,
"learning_rate": 6.686278345679625e-05,
"loss": 2.0999,
"step": 156
},
{
"epoch": 0.4126536369131574,
"grad_norm": 18.509098052978516,
"learning_rate": 6.529309583369605e-05,
"loss": 2.0921,
"step": 160
},
{
"epoch": 0.4126536369131574,
"eval_loss": 2.122069835662842,
"eval_runtime": 52.1961,
"eval_samples_per_second": 4.713,
"eval_steps_per_second": 4.713,
"step": 160
},
{
"epoch": 0.4229699778359863,
"grad_norm": 19.927303314208984,
"learning_rate": 6.370659707966967e-05,
"loss": 2.1235,
"step": 164
},
{
"epoch": 0.43328631875881524,
"grad_norm": 17.947538375854492,
"learning_rate": 6.2105031173736e-05,
"loss": 2.141,
"step": 168
},
{
"epoch": 0.43328631875881524,
"eval_loss": 2.1334285736083984,
"eval_runtime": 66.1797,
"eval_samples_per_second": 3.717,
"eval_steps_per_second": 3.717,
"step": 168
},
{
"epoch": 0.44360265968164414,
"grad_norm": 17.444217681884766,
"learning_rate": 6.049015865767318e-05,
"loss": 2.0906,
"step": 172
},
{
"epoch": 0.4539190006044731,
"grad_norm": 16.887006759643555,
"learning_rate": 5.88637547007204e-05,
"loss": 2.1312,
"step": 176
},
{
"epoch": 0.4539190006044731,
"eval_loss": 2.125943183898926,
"eval_runtime": 50.8568,
"eval_samples_per_second": 4.837,
"eval_steps_per_second": 4.837,
"step": 176
},
{
"epoch": 0.46423534152730206,
"grad_norm": 16.4300537109375,
"learning_rate": 5.722760714820057e-05,
"loss": 2.1426,
"step": 180
},
{
"epoch": 0.47455168245013096,
"grad_norm": 17.936267852783203,
"learning_rate": 5.5583514556208514e-05,
"loss": 2.106,
"step": 184
},
{
"epoch": 0.47455168245013096,
"eval_loss": 2.126936912536621,
"eval_runtime": 72.4228,
"eval_samples_per_second": 3.397,
"eval_steps_per_second": 3.397,
"step": 184
},
{
"epoch": 0.4848680233729599,
"grad_norm": 15.06416130065918,
"learning_rate": 5.393328421452514e-05,
"loss": 2.1054,
"step": 188
},
{
"epoch": 0.4951843642957888,
"grad_norm": 16.073657989501953,
"learning_rate": 5.2278730159931076e-05,
"loss": 2.1015,
"step": 192
},
{
"epoch": 0.4951843642957888,
"eval_loss": 2.1197259426116943,
"eval_runtime": 50.2504,
"eval_samples_per_second": 4.895,
"eval_steps_per_second": 4.895,
"step": 192
},
{
"epoch": 0.5055007052186178,
"grad_norm": 18.07906723022461,
"learning_rate": 5.062167118210367e-05,
"loss": 2.1588,
"step": 196
},
{
"epoch": 0.5158170461414467,
"grad_norm": 16.21997833251953,
"learning_rate": 4.896392882428901e-05,
"loss": 2.1368,
"step": 200
},
{
"epoch": 0.5158170461414467,
"eval_loss": 2.1163711547851562,
"eval_runtime": 51.0039,
"eval_samples_per_second": 4.823,
"eval_steps_per_second": 4.823,
"step": 200
},
{
"epoch": 0.5261333870642756,
"grad_norm": 17.681896209716797,
"learning_rate": 4.730732538094749e-05,
"loss": 2.1108,
"step": 204
},
{
"epoch": 0.5364497279871046,
"grad_norm": 14.08202838897705,
"learning_rate": 4.565368189457313e-05,
"loss": 2.0751,
"step": 208
},
{
"epoch": 0.5364497279871046,
"eval_loss": 2.1104061603546143,
"eval_runtime": 70.8566,
"eval_samples_per_second": 3.472,
"eval_steps_per_second": 3.472,
"step": 208
},
{
"epoch": 0.5467660689099335,
"grad_norm": 17.15287971496582,
"learning_rate": 4.400481615388948e-05,
"loss": 2.0737,
"step": 212
},
{
"epoch": 0.5570824098327625,
"grad_norm": 14.733731269836426,
"learning_rate": 4.236254069562213e-05,
"loss": 2.135,
"step": 216
},
{
"epoch": 0.5570824098327625,
"eval_loss": 2.1105358600616455,
"eval_runtime": 50.3866,
"eval_samples_per_second": 4.882,
"eval_steps_per_second": 4.882,
"step": 216
},
{
"epoch": 0.5673987507555914,
"grad_norm": 12.141791343688965,
"learning_rate": 4.0728660812044536e-05,
"loss": 2.1395,
"step": 220
},
{
"epoch": 0.5777150916784203,
"grad_norm": 13.20874309539795,
"learning_rate": 3.910497256648742e-05,
"loss": 2.0718,
"step": 224
},
{
"epoch": 0.5777150916784203,
"eval_loss": 2.100306749343872,
"eval_runtime": 68.0529,
"eval_samples_per_second": 3.615,
"eval_steps_per_second": 3.615,
"step": 224
},
{
"epoch": 0.5880314326012492,
"grad_norm": 14.548846244812012,
"learning_rate": 3.749326081899329e-05,
"loss": 2.0944,
"step": 228
},
{
"epoch": 0.5983477735240782,
"grad_norm": 15.402515411376953,
"learning_rate": 3.589529726428615e-05,
"loss": 2.0393,
"step": 232
},
{
"epoch": 0.5983477735240782,
"eval_loss": 2.1025285720825195,
"eval_runtime": 51.9996,
"eval_samples_per_second": 4.731,
"eval_steps_per_second": 4.731,
"step": 232
},
{
"epoch": 0.6086641144469072,
"grad_norm": 13.062746047973633,
"learning_rate": 3.431283848421347e-05,
"loss": 2.1215,
"step": 236
},
{
"epoch": 0.618980455369736,
"grad_norm": 16.821453094482422,
"learning_rate": 3.274762401680124e-05,
"loss": 2.1034,
"step": 240
},
{
"epoch": 0.618980455369736,
"eval_loss": 2.0945794582366943,
"eval_runtime": 69.0514,
"eval_samples_per_second": 3.563,
"eval_steps_per_second": 3.563,
"step": 240
},
{
"epoch": 0.629296796292565,
"grad_norm": 13.247776985168457,
"learning_rate": 3.120137444404442e-05,
"loss": 2.0619,
"step": 244
},
{
"epoch": 0.6396131372153939,
"grad_norm": 12.850509643554688,
"learning_rate": 2.9675789500535328e-05,
"loss": 2.045,
"step": 248
},
{
"epoch": 0.6396131372153939,
"eval_loss": 2.093926191329956,
"eval_runtime": 49.7964,
"eval_samples_per_second": 4.94,
"eval_steps_per_second": 4.94,
"step": 248
},
{
"epoch": 0.6499294781382229,
"grad_norm": 15.139781951904297,
"learning_rate": 2.8172546205008683e-05,
"loss": 2.1207,
"step": 252
},
{
"epoch": 0.6602458190610517,
"grad_norm": 13.269444465637207,
"learning_rate": 2.6693297016857188e-05,
"loss": 2.077,
"step": 256
},
{
"epoch": 0.6602458190610517,
"eval_loss": 2.081383466720581,
"eval_runtime": 52.4381,
"eval_samples_per_second": 4.691,
"eval_steps_per_second": 4.691,
"step": 256
},
{
"epoch": 0.6705621599838807,
"grad_norm": 11.657261848449707,
"learning_rate": 2.523966801964468e-05,
"loss": 2.0693,
"step": 260
},
{
"epoch": 0.6808785009067096,
"grad_norm": 11.285372734069824,
"learning_rate": 2.3813257133612827e-05,
"loss": 2.0514,
"step": 264
},
{
"epoch": 0.6808785009067096,
"eval_loss": 2.0800182819366455,
"eval_runtime": 66.6605,
"eval_samples_per_second": 3.69,
"eval_steps_per_second": 3.69,
"step": 264
},
{
"epoch": 0.6911948418295386,
"grad_norm": 13.049285888671875,
"learning_rate": 2.2415632359146856e-05,
"loss": 2.0855,
"step": 268
},
{
"epoch": 0.7015111827523676,
"grad_norm": 11.901713371276855,
"learning_rate": 2.104833005313131e-05,
"loss": 2.0222,
"step": 272
},
{
"epoch": 0.7015111827523676,
"eval_loss": 2.0774030685424805,
"eval_runtime": 51.2508,
"eval_samples_per_second": 4.8,
"eval_steps_per_second": 4.8,
"step": 272
},
{
"epoch": 0.7118275236751964,
"grad_norm": 12.342489242553711,
"learning_rate": 1.971285324008994e-05,
"loss": 2.1345,
"step": 276
},
{
"epoch": 0.7221438645980254,
"grad_norm": 13.865422248840332,
"learning_rate": 1.84106699599668e-05,
"loss": 2.075,
"step": 280
},
{
"epoch": 0.7221438645980254,
"eval_loss": 2.074871301651001,
"eval_runtime": 63.1191,
"eval_samples_per_second": 3.897,
"eval_steps_per_second": 3.897,
"step": 280
},
{
"epoch": 0.7324602055208543,
"grad_norm": 12.347137451171875,
"learning_rate": 1.7143211654364762e-05,
"loss": 2.0861,
"step": 284
},
{
"epoch": 0.7427765464436833,
"grad_norm": 15.943358421325684,
"learning_rate": 1.5911871593014837e-05,
"loss": 2.1013,
"step": 288
},
{
"epoch": 0.7427765464436833,
"eval_loss": 2.070469379425049,
"eval_runtime": 50.0079,
"eval_samples_per_second": 4.919,
"eval_steps_per_second": 4.919,
"step": 288
},
{
"epoch": 0.7530928873665121,
"grad_norm": 11.594297409057617,
"learning_rate": 1.4718003342206722e-05,
"loss": 2.053,
"step": 292
},
{
"epoch": 0.7634092282893411,
"grad_norm": 13.693614959716797,
"learning_rate": 1.3562919276863844e-05,
"loss": 2.0929,
"step": 296
},
{
"epoch": 0.7634092282893411,
"eval_loss": 2.064333438873291,
"eval_runtime": 51.108,
"eval_samples_per_second": 4.813,
"eval_steps_per_second": 4.813,
"step": 296
},
{
"epoch": 0.7737255692121701,
"grad_norm": 12.406514167785645,
"learning_rate": 1.2447889137898293e-05,
"loss": 2.1147,
"step": 300
},
{
"epoch": 0.784041910134999,
"grad_norm": 14.044355392456055,
"learning_rate": 1.1374138636432053e-05,
"loss": 2.0996,
"step": 304
},
{
"epoch": 0.784041910134999,
"eval_loss": 2.0692458152770996,
"eval_runtime": 67.0744,
"eval_samples_per_second": 3.668,
"eval_steps_per_second": 3.668,
"step": 304
},
{
"epoch": 0.794358251057828,
"grad_norm": 10.247350692749023,
"learning_rate": 1.0342848106418368e-05,
"loss": 2.0892,
"step": 308
},
{
"epoch": 0.8046745919806568,
"grad_norm": 10.73900032043457,
"learning_rate": 9.35515120714447e-06,
"loss": 2.0507,
"step": 312
},
{
"epoch": 0.8046745919806568,
"eval_loss": 2.0587587356567383,
"eval_runtime": 49.942,
"eval_samples_per_second": 4.926,
"eval_steps_per_second": 4.926,
"step": 312
},
{
"epoch": 0.8149909329034858,
"grad_norm": 10.756080627441406,
"learning_rate": 8.41213367704224e-06,
"loss": 2.0867,
"step": 316
},
{
"epoch": 0.8253072738263147,
"grad_norm": 14.330180168151855,
"learning_rate": 7.51483214017637e-06,
"loss": 2.0353,
"step": 320
},
{
"epoch": 0.8253072738263147,
"eval_loss": 2.0574405193328857,
"eval_runtime": 68.5693,
"eval_samples_per_second": 3.588,
"eval_steps_per_second": 3.588,
"step": 320
},
{
"epoch": 0.8356236147491437,
"grad_norm": 10.09432601928711,
"learning_rate": 6.664232966721995e-06,
"loss": 2.0535,
"step": 324
},
{
"epoch": 0.8459399556719726,
"grad_norm": 12.156997680664062,
"learning_rate": 5.8612711886848196e-06,
"loss": 2.0128,
"step": 328
},
{
"epoch": 0.8459399556719726,
"eval_loss": 2.056994915008545,
"eval_runtime": 49.8867,
"eval_samples_per_second": 4.931,
"eval_steps_per_second": 4.931,
"step": 328
},
{
"epoch": 0.8562562965948015,
"grad_norm": 13.915254592895508,
"learning_rate": 5.106829472055202e-06,
"loss": 2.0233,
"step": 332
},
{
"epoch": 0.8665726375176305,
"grad_norm": 14.312435150146484,
"learning_rate": 4.401737146526219e-06,
"loss": 2.0508,
"step": 336
},
{
"epoch": 0.8665726375176305,
"eval_loss": 2.050326108932495,
"eval_runtime": 67.6182,
"eval_samples_per_second": 3.638,
"eval_steps_per_second": 3.638,
"step": 336
},
{
"epoch": 0.8768889784404594,
"grad_norm": 12.76665210723877,
"learning_rate": 3.7467692938425057e-06,
"loss": 2.0105,
"step": 340
},
{
"epoch": 0.8872053193632883,
"grad_norm": 17.83547019958496,
"learning_rate": 3.142645895781715e-06,
"loss": 2.067,
"step": 344
},
{
"epoch": 0.8872053193632883,
"eval_loss": 2.0471653938293457,
"eval_runtime": 157.8315,
"eval_samples_per_second": 1.559,
"eval_steps_per_second": 1.559,
"step": 344
},
{
"epoch": 0.8975216602861172,
"grad_norm": 10.910913467407227,
"learning_rate": 2.5900310427053044e-06,
"loss": 2.0453,
"step": 348
},
{
"epoch": 0.9078380012089462,
"grad_norm": 9.634235382080078,
"learning_rate": 2.089532203548794e-06,
"loss": 2.0821,
"step": 352
},
{
"epoch": 0.9078380012089462,
"eval_loss": 2.047574281692505,
"eval_runtime": 229.272,
"eval_samples_per_second": 1.073,
"eval_steps_per_second": 1.073,
"step": 352
},
{
"epoch": 0.9181543421317752,
"grad_norm": 11.296228408813477,
"learning_rate": 1.6416995580537664e-06,
"loss": 2.0581,
"step": 356
},
{
"epoch": 0.9284706830546041,
"grad_norm": 12.595250129699707,
"learning_rate": 1.247025391975698e-06,
"loss": 2.0461,
"step": 360
},
{
"epoch": 0.9284706830546041,
"eval_loss": 2.04707932472229,
"eval_runtime": 278.4472,
"eval_samples_per_second": 0.883,
"eval_steps_per_second": 0.883,
"step": 360
},
{
"epoch": 0.938787023977433,
"grad_norm": 11.871248245239258,
"learning_rate": 9.059435559326257e-07,
"loss": 2.0658,
"step": 364
},
{
"epoch": 0.9491033649002619,
"grad_norm": 11.920108795166016,
"learning_rate": 6.188289884893062e-07,
"loss": 2.0666,
"step": 368
},
{
"epoch": 0.9491033649002619,
"eval_loss": 2.046149492263794,
"eval_runtime": 241.2948,
"eval_samples_per_second": 1.019,
"eval_steps_per_second": 1.019,
"step": 368
},
{
"epoch": 0.9594197058230909,
"grad_norm": 12.083566665649414,
"learning_rate": 3.8599730400115107e-07,
"loss": 2.0008,
"step": 372
},
{
"epoch": 0.9697360467459198,
"grad_norm": 11.261472702026367,
"learning_rate": 2.0770444567118075e-07,
"loss": 2.0639,
"step": 376
},
{
"epoch": 0.9697360467459198,
"eval_loss": 2.0458359718322754,
"eval_runtime": 377.6181,
"eval_samples_per_second": 0.651,
"eval_steps_per_second": 0.651,
"step": 376
},
{
"epoch": 0.9800523876687487,
"grad_norm": 10.003210067749023,
"learning_rate": 8.414640420116305e-08,
"loss": 2.0595,
"step": 380
},
{
"epoch": 0.9903687285915777,
"grad_norm": 9.608034133911133,
"learning_rate": 1.5459002346324135e-08,
"loss": 1.9859,
"step": 384
},
{
"epoch": 0.9903687285915777,
"eval_loss": 2.04579496383667,
"eval_runtime": 243.787,
"eval_samples_per_second": 1.009,
"eval_steps_per_second": 1.009,
"step": 384
}
],
"logging_steps": 4,
"max_steps": 387,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 8,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.501033446014976e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}