{ "best_metric": 2.031247138977051, "best_model_checkpoint": "/home/datta0/models/lora_final/Mistral-7B-v0.3_pct_reverse_r32/checkpoint-8", "epoch": 0.9903687285915777, "eval_steps": 8, "global_step": 384, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025790852307072333, "grad_norm": 23.795961380004883, "learning_rate": 1.25e-05, "loss": 2.2237, "step": 1 }, { "epoch": 0.010316340922828933, "grad_norm": 10.762740135192871, "learning_rate": 5e-05, "loss": 2.0557, "step": 4 }, { "epoch": 0.020632681845657867, "grad_norm": 12.864238739013672, "learning_rate": 0.0001, "loss": 1.991, "step": 8 }, { "epoch": 0.020632681845657867, "eval_loss": 2.031247138977051, "eval_runtime": 50.8376, "eval_samples_per_second": 4.839, "eval_steps_per_second": 4.839, "step": 8 }, { "epoch": 0.0309490227684868, "grad_norm": 20.236906051635742, "learning_rate": 9.997251843068762e-05, "loss": 2.0347, "step": 12 }, { "epoch": 0.04126536369131573, "grad_norm": 11.911772727966309, "learning_rate": 9.989010393221656e-05, "loss": 2.0461, "step": 16 }, { "epoch": 0.04126536369131573, "eval_loss": 2.0334763526916504, "eval_runtime": 69.6741, "eval_samples_per_second": 3.531, "eval_steps_per_second": 3.531, "step": 16 }, { "epoch": 0.05158170461414467, "grad_norm": 13.50040340423584, "learning_rate": 9.97528470997769e-05, "loss": 1.995, "step": 20 }, { "epoch": 0.0618980455369736, "grad_norm": 12.473864555358887, "learning_rate": 9.956089881469482e-05, "loss": 2.0456, "step": 24 }, { "epoch": 0.0618980455369736, "eval_loss": 2.060091018676758, "eval_runtime": 49.4489, "eval_samples_per_second": 4.975, "eval_steps_per_second": 4.975, "step": 24 }, { "epoch": 0.07221438645980253, "grad_norm": 16.583091735839844, "learning_rate": 9.931447007857432e-05, "loss": 2.0845, "step": 28 }, { "epoch": 0.08253072738263147, "grad_norm": 38.525352478027344, "learning_rate": 9.901383178135113e-05, "loss": 2.0584, "step": 32 }, { "epoch": 0.08253072738263147, "eval_loss": 2.087923765182495, "eval_runtime": 68.1674, "eval_samples_per_second": 3.609, "eval_steps_per_second": 3.609, "step": 32 }, { "epoch": 0.09284706830546041, "grad_norm": 18.212785720825195, "learning_rate": 9.865931440351337e-05, "loss": 2.0671, "step": 36 }, { "epoch": 0.10316340922828934, "grad_norm": 18.464550018310547, "learning_rate": 9.825130765281668e-05, "loss": 2.1123, "step": 40 }, { "epoch": 0.10316340922828934, "eval_loss": 2.080946922302246, "eval_runtime": 51.3512, "eval_samples_per_second": 4.791, "eval_steps_per_second": 4.791, "step": 40 }, { "epoch": 0.11347975015111827, "grad_norm": 18.683780670166016, "learning_rate": 9.779026003589304e-05, "loss": 2.0638, "step": 44 }, { "epoch": 0.1237960910739472, "grad_norm": 20.039987564086914, "learning_rate": 9.727667836522407e-05, "loss": 2.0666, "step": 48 }, { "epoch": 0.1237960910739472, "eval_loss": 2.0889713764190674, "eval_runtime": 49.6263, "eval_samples_per_second": 4.957, "eval_steps_per_second": 4.957, "step": 48 }, { "epoch": 0.13411243199677614, "grad_norm": 23.5998477935791, "learning_rate": 9.6711127202021e-05, "loss": 2.1188, "step": 52 }, { "epoch": 0.14442877291960507, "grad_norm": 16.58580780029297, "learning_rate": 9.609422823562345e-05, "loss": 2.0733, "step": 56 }, { "epoch": 0.14442877291960507, "eval_loss": 2.0954222679138184, "eval_runtime": 68.3777, "eval_samples_per_second": 3.598, "eval_steps_per_second": 3.598, "step": 56 }, { "epoch": 0.154745113842434, "grad_norm": 17.798566818237305, "learning_rate": 9.542665960009959e-05, "loss": 2.0855, "step": 60 }, { "epoch": 0.16506145476526293, "grad_norm": 16.25016212463379, "learning_rate": 9.470915512879852e-05, "loss": 2.1236, "step": 64 }, { "epoch": 0.16506145476526293, "eval_loss": 2.0970985889434814, "eval_runtime": 50.5579, "eval_samples_per_second": 4.866, "eval_steps_per_second": 4.866, "step": 64 }, { "epoch": 0.1753777956880919, "grad_norm": 14.97128677368164, "learning_rate": 9.394250354767467e-05, "loss": 2.104, "step": 68 }, { "epoch": 0.18569413661092082, "grad_norm": 18.22063636779785, "learning_rate": 9.312754760827061e-05, "loss": 2.1103, "step": 72 }, { "epoch": 0.18569413661092082, "eval_loss": 2.1007986068725586, "eval_runtime": 68.15, "eval_samples_per_second": 3.61, "eval_steps_per_second": 3.61, "step": 72 }, { "epoch": 0.19601047753374976, "grad_norm": 15.88463306427002, "learning_rate": 9.226518316131176e-05, "loss": 2.1123, "step": 76 }, { "epoch": 0.2063268184565787, "grad_norm": 16.19556999206543, "learning_rate": 9.1356358171931e-05, "loss": 2.0876, "step": 80 }, { "epoch": 0.2063268184565787, "eval_loss": 2.104221820831299, "eval_runtime": 51.0209, "eval_samples_per_second": 4.822, "eval_steps_per_second": 4.822, "step": 80 }, { "epoch": 0.21664315937940762, "grad_norm": 21.194753646850586, "learning_rate": 9.040207167760586e-05, "loss": 2.0851, "step": 84 }, { "epoch": 0.22695950030223655, "grad_norm": 16.5502986907959, "learning_rate": 8.940337268995385e-05, "loss": 2.1107, "step": 88 }, { "epoch": 0.22695950030223655, "eval_loss": 2.115513324737549, "eval_runtime": 71.0739, "eval_samples_per_second": 3.461, "eval_steps_per_second": 3.461, "step": 88 }, { "epoch": 0.23727584122506548, "grad_norm": 16.366037368774414, "learning_rate": 8.836135904159302e-05, "loss": 2.1165, "step": 92 }, { "epoch": 0.2475921821478944, "grad_norm": 19.07808494567871, "learning_rate": 8.727717617933544e-05, "loss": 2.0889, "step": 96 }, { "epoch": 0.2475921821478944, "eval_loss": 2.1082587242126465, "eval_runtime": 51.4649, "eval_samples_per_second": 4.78, "eval_steps_per_second": 4.78, "step": 96 }, { "epoch": 0.25790852307072337, "grad_norm": 16.868947982788086, "learning_rate": 8.615201590504017e-05, "loss": 2.1333, "step": 100 }, { "epoch": 0.2682248639935523, "grad_norm": 18.22587013244629, "learning_rate": 8.498711506550983e-05, "loss": 2.097, "step": 104 }, { "epoch": 0.2682248639935523, "eval_loss": 2.1185896396636963, "eval_runtime": 50.4577, "eval_samples_per_second": 4.875, "eval_steps_per_second": 4.875, "step": 104 }, { "epoch": 0.27854120491638124, "grad_norm": 17.453964233398438, "learning_rate": 8.378375419287099e-05, "loss": 2.1481, "step": 108 }, { "epoch": 0.28885754583921014, "grad_norm": 15.08243465423584, "learning_rate": 8.25432560969328e-05, "loss": 2.0962, "step": 112 }, { "epoch": 0.28885754583921014, "eval_loss": 2.120164394378662, "eval_runtime": 68.8529, "eval_samples_per_second": 3.573, "eval_steps_per_second": 3.573, "step": 112 }, { "epoch": 0.2991738867620391, "grad_norm": 15.695137977600098, "learning_rate": 8.126698441107146e-05, "loss": 2.1125, "step": 116 }, { "epoch": 0.309490227684868, "grad_norm": 28.060436248779297, "learning_rate": 7.995634209323886e-05, "loss": 2.1415, "step": 120 }, { "epoch": 0.309490227684868, "eval_loss": 2.1305339336395264, "eval_runtime": 50.6817, "eval_samples_per_second": 4.854, "eval_steps_per_second": 4.854, "step": 120 }, { "epoch": 0.31980656860769696, "grad_norm": 17.550668716430664, "learning_rate": 7.861276988374302e-05, "loss": 2.141, "step": 124 }, { "epoch": 0.33012290953052587, "grad_norm": 20.923986434936523, "learning_rate": 7.723774472149601e-05, "loss": 2.1294, "step": 128 }, { "epoch": 0.33012290953052587, "eval_loss": 2.1169350147247314, "eval_runtime": 68.8987, "eval_samples_per_second": 3.57, "eval_steps_per_second": 3.57, "step": 128 }, { "epoch": 0.3404392504533548, "grad_norm": 18.64177131652832, "learning_rate": 7.583277812046993e-05, "loss": 2.1473, "step": 132 }, { "epoch": 0.3507555913761838, "grad_norm": 20.397233963012695, "learning_rate": 7.439941450814591e-05, "loss": 2.1476, "step": 136 }, { "epoch": 0.3507555913761838, "eval_loss": 2.13000226020813, "eval_runtime": 52.251, "eval_samples_per_second": 4.708, "eval_steps_per_second": 4.708, "step": 136 }, { "epoch": 0.3610719322990127, "grad_norm": 14.290401458740234, "learning_rate": 7.293922952778239e-05, "loss": 2.1519, "step": 140 }, { "epoch": 0.37138827322184165, "grad_norm": 19.956518173217773, "learning_rate": 7.145382830636924e-05, "loss": 2.1725, "step": 144 }, { "epoch": 0.37138827322184165, "eval_loss": 2.124537467956543, "eval_runtime": 66.2352, "eval_samples_per_second": 3.714, "eval_steps_per_second": 3.714, "step": 144 }, { "epoch": 0.38170461414467055, "grad_norm": 18.11756134033203, "learning_rate": 6.994484369017143e-05, "loss": 2.131, "step": 148 }, { "epoch": 0.3920209550674995, "grad_norm": 15.948026657104492, "learning_rate": 6.841393444980177e-05, "loss": 2.1159, "step": 152 }, { "epoch": 0.3920209550674995, "eval_loss": 2.117161273956299, "eval_runtime": 65.9204, "eval_samples_per_second": 3.732, "eval_steps_per_second": 3.732, "step": 152 }, { "epoch": 0.4023372959903284, "grad_norm": 17.677406311035156, "learning_rate": 6.686278345679625e-05, "loss": 2.0999, "step": 156 }, { "epoch": 0.4126536369131574, "grad_norm": 18.509098052978516, "learning_rate": 6.529309583369605e-05, "loss": 2.0921, "step": 160 }, { "epoch": 0.4126536369131574, "eval_loss": 2.122069835662842, "eval_runtime": 52.1961, "eval_samples_per_second": 4.713, "eval_steps_per_second": 4.713, "step": 160 }, { "epoch": 0.4229699778359863, "grad_norm": 19.927303314208984, "learning_rate": 6.370659707966967e-05, "loss": 2.1235, "step": 164 }, { "epoch": 0.43328631875881524, "grad_norm": 17.947538375854492, "learning_rate": 6.2105031173736e-05, "loss": 2.141, "step": 168 }, { "epoch": 0.43328631875881524, "eval_loss": 2.1334285736083984, "eval_runtime": 66.1797, "eval_samples_per_second": 3.717, "eval_steps_per_second": 3.717, "step": 168 }, { "epoch": 0.44360265968164414, "grad_norm": 17.444217681884766, "learning_rate": 6.049015865767318e-05, "loss": 2.0906, "step": 172 }, { "epoch": 0.4539190006044731, "grad_norm": 16.887006759643555, "learning_rate": 5.88637547007204e-05, "loss": 2.1312, "step": 176 }, { "epoch": 0.4539190006044731, "eval_loss": 2.125943183898926, "eval_runtime": 50.8568, "eval_samples_per_second": 4.837, "eval_steps_per_second": 4.837, "step": 176 }, { "epoch": 0.46423534152730206, "grad_norm": 16.4300537109375, "learning_rate": 5.722760714820057e-05, "loss": 2.1426, "step": 180 }, { "epoch": 0.47455168245013096, "grad_norm": 17.936267852783203, "learning_rate": 5.5583514556208514e-05, "loss": 2.106, "step": 184 }, { "epoch": 0.47455168245013096, "eval_loss": 2.126936912536621, "eval_runtime": 72.4228, "eval_samples_per_second": 3.397, "eval_steps_per_second": 3.397, "step": 184 }, { "epoch": 0.4848680233729599, "grad_norm": 15.06416130065918, "learning_rate": 5.393328421452514e-05, "loss": 2.1054, "step": 188 }, { "epoch": 0.4951843642957888, "grad_norm": 16.073657989501953, "learning_rate": 5.2278730159931076e-05, "loss": 2.1015, "step": 192 }, { "epoch": 0.4951843642957888, "eval_loss": 2.1197259426116943, "eval_runtime": 50.2504, "eval_samples_per_second": 4.895, "eval_steps_per_second": 4.895, "step": 192 }, { "epoch": 0.5055007052186178, "grad_norm": 18.07906723022461, "learning_rate": 5.062167118210367e-05, "loss": 2.1588, "step": 196 }, { "epoch": 0.5158170461414467, "grad_norm": 16.21997833251953, "learning_rate": 4.896392882428901e-05, "loss": 2.1368, "step": 200 }, { "epoch": 0.5158170461414467, "eval_loss": 2.1163711547851562, "eval_runtime": 51.0039, "eval_samples_per_second": 4.823, "eval_steps_per_second": 4.823, "step": 200 }, { "epoch": 0.5261333870642756, "grad_norm": 17.681896209716797, "learning_rate": 4.730732538094749e-05, "loss": 2.1108, "step": 204 }, { "epoch": 0.5364497279871046, "grad_norm": 14.08202838897705, "learning_rate": 4.565368189457313e-05, "loss": 2.0751, "step": 208 }, { "epoch": 0.5364497279871046, "eval_loss": 2.1104061603546143, "eval_runtime": 70.8566, "eval_samples_per_second": 3.472, "eval_steps_per_second": 3.472, "step": 208 }, { "epoch": 0.5467660689099335, "grad_norm": 17.15287971496582, "learning_rate": 4.400481615388948e-05, "loss": 2.0737, "step": 212 }, { "epoch": 0.5570824098327625, "grad_norm": 14.733731269836426, "learning_rate": 4.236254069562213e-05, "loss": 2.135, "step": 216 }, { "epoch": 0.5570824098327625, "eval_loss": 2.1105358600616455, "eval_runtime": 50.3866, "eval_samples_per_second": 4.882, "eval_steps_per_second": 4.882, "step": 216 }, { "epoch": 0.5673987507555914, "grad_norm": 12.141791343688965, "learning_rate": 4.0728660812044536e-05, "loss": 2.1395, "step": 220 }, { "epoch": 0.5777150916784203, "grad_norm": 13.20874309539795, "learning_rate": 3.910497256648742e-05, "loss": 2.0718, "step": 224 }, { "epoch": 0.5777150916784203, "eval_loss": 2.100306749343872, "eval_runtime": 68.0529, "eval_samples_per_second": 3.615, "eval_steps_per_second": 3.615, "step": 224 }, { "epoch": 0.5880314326012492, "grad_norm": 14.548846244812012, "learning_rate": 3.749326081899329e-05, "loss": 2.0944, "step": 228 }, { "epoch": 0.5983477735240782, "grad_norm": 15.402515411376953, "learning_rate": 3.589529726428615e-05, "loss": 2.0393, "step": 232 }, { "epoch": 0.5983477735240782, "eval_loss": 2.1025285720825195, "eval_runtime": 51.9996, "eval_samples_per_second": 4.731, "eval_steps_per_second": 4.731, "step": 232 }, { "epoch": 0.6086641144469072, "grad_norm": 13.062746047973633, "learning_rate": 3.431283848421347e-05, "loss": 2.1215, "step": 236 }, { "epoch": 0.618980455369736, "grad_norm": 16.821453094482422, "learning_rate": 3.274762401680124e-05, "loss": 2.1034, "step": 240 }, { "epoch": 0.618980455369736, "eval_loss": 2.0945794582366943, "eval_runtime": 69.0514, "eval_samples_per_second": 3.563, "eval_steps_per_second": 3.563, "step": 240 }, { "epoch": 0.629296796292565, "grad_norm": 13.247776985168457, "learning_rate": 3.120137444404442e-05, "loss": 2.0619, "step": 244 }, { "epoch": 0.6396131372153939, "grad_norm": 12.850509643554688, "learning_rate": 2.9675789500535328e-05, "loss": 2.045, "step": 248 }, { "epoch": 0.6396131372153939, "eval_loss": 2.093926191329956, "eval_runtime": 49.7964, "eval_samples_per_second": 4.94, "eval_steps_per_second": 4.94, "step": 248 }, { "epoch": 0.6499294781382229, "grad_norm": 15.139781951904297, "learning_rate": 2.8172546205008683e-05, "loss": 2.1207, "step": 252 }, { "epoch": 0.6602458190610517, "grad_norm": 13.269444465637207, "learning_rate": 2.6693297016857188e-05, "loss": 2.077, "step": 256 }, { "epoch": 0.6602458190610517, "eval_loss": 2.081383466720581, "eval_runtime": 52.4381, "eval_samples_per_second": 4.691, "eval_steps_per_second": 4.691, "step": 256 }, { "epoch": 0.6705621599838807, "grad_norm": 11.657261848449707, "learning_rate": 2.523966801964468e-05, "loss": 2.0693, "step": 260 }, { "epoch": 0.6808785009067096, "grad_norm": 11.285372734069824, "learning_rate": 2.3813257133612827e-05, "loss": 2.0514, "step": 264 }, { "epoch": 0.6808785009067096, "eval_loss": 2.0800182819366455, "eval_runtime": 66.6605, "eval_samples_per_second": 3.69, "eval_steps_per_second": 3.69, "step": 264 }, { "epoch": 0.6911948418295386, "grad_norm": 13.049285888671875, "learning_rate": 2.2415632359146856e-05, "loss": 2.0855, "step": 268 }, { "epoch": 0.7015111827523676, "grad_norm": 11.901713371276855, "learning_rate": 2.104833005313131e-05, "loss": 2.0222, "step": 272 }, { "epoch": 0.7015111827523676, "eval_loss": 2.0774030685424805, "eval_runtime": 51.2508, "eval_samples_per_second": 4.8, "eval_steps_per_second": 4.8, "step": 272 }, { "epoch": 0.7118275236751964, "grad_norm": 12.342489242553711, "learning_rate": 1.971285324008994e-05, "loss": 2.1345, "step": 276 }, { "epoch": 0.7221438645980254, "grad_norm": 13.865422248840332, "learning_rate": 1.84106699599668e-05, "loss": 2.075, "step": 280 }, { "epoch": 0.7221438645980254, "eval_loss": 2.074871301651001, "eval_runtime": 63.1191, "eval_samples_per_second": 3.897, "eval_steps_per_second": 3.897, "step": 280 }, { "epoch": 0.7324602055208543, "grad_norm": 12.347137451171875, "learning_rate": 1.7143211654364762e-05, "loss": 2.0861, "step": 284 }, { "epoch": 0.7427765464436833, "grad_norm": 15.943358421325684, "learning_rate": 1.5911871593014837e-05, "loss": 2.1013, "step": 288 }, { "epoch": 0.7427765464436833, "eval_loss": 2.070469379425049, "eval_runtime": 50.0079, "eval_samples_per_second": 4.919, "eval_steps_per_second": 4.919, "step": 288 }, { "epoch": 0.7530928873665121, "grad_norm": 11.594297409057617, "learning_rate": 1.4718003342206722e-05, "loss": 2.053, "step": 292 }, { "epoch": 0.7634092282893411, "grad_norm": 13.693614959716797, "learning_rate": 1.3562919276863844e-05, "loss": 2.0929, "step": 296 }, { "epoch": 0.7634092282893411, "eval_loss": 2.064333438873291, "eval_runtime": 51.108, "eval_samples_per_second": 4.813, "eval_steps_per_second": 4.813, "step": 296 }, { "epoch": 0.7737255692121701, "grad_norm": 12.406514167785645, "learning_rate": 1.2447889137898293e-05, "loss": 2.1147, "step": 300 }, { "epoch": 0.784041910134999, "grad_norm": 14.044355392456055, "learning_rate": 1.1374138636432053e-05, "loss": 2.0996, "step": 304 }, { "epoch": 0.784041910134999, "eval_loss": 2.0692458152770996, "eval_runtime": 67.0744, "eval_samples_per_second": 3.668, "eval_steps_per_second": 3.668, "step": 304 }, { "epoch": 0.794358251057828, "grad_norm": 10.247350692749023, "learning_rate": 1.0342848106418368e-05, "loss": 2.0892, "step": 308 }, { "epoch": 0.8046745919806568, "grad_norm": 10.73900032043457, "learning_rate": 9.35515120714447e-06, "loss": 2.0507, "step": 312 }, { "epoch": 0.8046745919806568, "eval_loss": 2.0587587356567383, "eval_runtime": 49.942, "eval_samples_per_second": 4.926, "eval_steps_per_second": 4.926, "step": 312 }, { "epoch": 0.8149909329034858, "grad_norm": 10.756080627441406, "learning_rate": 8.41213367704224e-06, "loss": 2.0867, "step": 316 }, { "epoch": 0.8253072738263147, "grad_norm": 14.330180168151855, "learning_rate": 7.51483214017637e-06, "loss": 2.0353, "step": 320 }, { "epoch": 0.8253072738263147, "eval_loss": 2.0574405193328857, "eval_runtime": 68.5693, "eval_samples_per_second": 3.588, "eval_steps_per_second": 3.588, "step": 320 }, { "epoch": 0.8356236147491437, "grad_norm": 10.09432601928711, "learning_rate": 6.664232966721995e-06, "loss": 2.0535, "step": 324 }, { "epoch": 0.8459399556719726, "grad_norm": 12.156997680664062, "learning_rate": 5.8612711886848196e-06, "loss": 2.0128, "step": 328 }, { "epoch": 0.8459399556719726, "eval_loss": 2.056994915008545, "eval_runtime": 49.8867, "eval_samples_per_second": 4.931, "eval_steps_per_second": 4.931, "step": 328 }, { "epoch": 0.8562562965948015, "grad_norm": 13.915254592895508, "learning_rate": 5.106829472055202e-06, "loss": 2.0233, "step": 332 }, { "epoch": 0.8665726375176305, "grad_norm": 14.312435150146484, "learning_rate": 4.401737146526219e-06, "loss": 2.0508, "step": 336 }, { "epoch": 0.8665726375176305, "eval_loss": 2.050326108932495, "eval_runtime": 67.6182, "eval_samples_per_second": 3.638, "eval_steps_per_second": 3.638, "step": 336 }, { "epoch": 0.8768889784404594, "grad_norm": 12.76665210723877, "learning_rate": 3.7467692938425057e-06, "loss": 2.0105, "step": 340 }, { "epoch": 0.8872053193632883, "grad_norm": 17.83547019958496, "learning_rate": 3.142645895781715e-06, "loss": 2.067, "step": 344 }, { "epoch": 0.8872053193632883, "eval_loss": 2.0471653938293457, "eval_runtime": 157.8315, "eval_samples_per_second": 1.559, "eval_steps_per_second": 1.559, "step": 344 }, { "epoch": 0.8975216602861172, "grad_norm": 10.910913467407227, "learning_rate": 2.5900310427053044e-06, "loss": 2.0453, "step": 348 }, { "epoch": 0.9078380012089462, "grad_norm": 9.634235382080078, "learning_rate": 2.089532203548794e-06, "loss": 2.0821, "step": 352 }, { "epoch": 0.9078380012089462, "eval_loss": 2.047574281692505, "eval_runtime": 229.272, "eval_samples_per_second": 1.073, "eval_steps_per_second": 1.073, "step": 352 }, { "epoch": 0.9181543421317752, "grad_norm": 11.296228408813477, "learning_rate": 1.6416995580537664e-06, "loss": 2.0581, "step": 356 }, { "epoch": 0.9284706830546041, "grad_norm": 12.595250129699707, "learning_rate": 1.247025391975698e-06, "loss": 2.0461, "step": 360 }, { "epoch": 0.9284706830546041, "eval_loss": 2.04707932472229, "eval_runtime": 278.4472, "eval_samples_per_second": 0.883, "eval_steps_per_second": 0.883, "step": 360 }, { "epoch": 0.938787023977433, "grad_norm": 11.871248245239258, "learning_rate": 9.059435559326257e-07, "loss": 2.0658, "step": 364 }, { "epoch": 0.9491033649002619, "grad_norm": 11.920108795166016, "learning_rate": 6.188289884893062e-07, "loss": 2.0666, "step": 368 }, { "epoch": 0.9491033649002619, "eval_loss": 2.046149492263794, "eval_runtime": 241.2948, "eval_samples_per_second": 1.019, "eval_steps_per_second": 1.019, "step": 368 }, { "epoch": 0.9594197058230909, "grad_norm": 12.083566665649414, "learning_rate": 3.8599730400115107e-07, "loss": 2.0008, "step": 372 }, { "epoch": 0.9697360467459198, "grad_norm": 11.261472702026367, "learning_rate": 2.0770444567118075e-07, "loss": 2.0639, "step": 376 }, { "epoch": 0.9697360467459198, "eval_loss": 2.0458359718322754, "eval_runtime": 377.6181, "eval_samples_per_second": 0.651, "eval_steps_per_second": 0.651, "step": 376 }, { "epoch": 0.9800523876687487, "grad_norm": 10.003210067749023, "learning_rate": 8.414640420116305e-08, "loss": 2.0595, "step": 380 }, { "epoch": 0.9903687285915777, "grad_norm": 9.608034133911133, "learning_rate": 1.5459002346324135e-08, "loss": 1.9859, "step": 384 }, { "epoch": 0.9903687285915777, "eval_loss": 2.04579496383667, "eval_runtime": 243.787, "eval_samples_per_second": 1.009, "eval_steps_per_second": 1.009, "step": 384 } ], "logging_steps": 4, "max_steps": 387, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 8, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.501033446014976e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }