{ "best_metric": 2.037567615509033, "best_model_checkpoint": "/home/datta0/models/lora_final/Mistral-7B-v0.3_pct_default_r32/checkpoint-16", "epoch": 0.9981059842836993, "eval_steps": 8, "global_step": 387, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025790852307072333, "grad_norm": 23.78647804260254, "learning_rate": 1.25e-05, "loss": 2.2237, "step": 1 }, { "epoch": 0.010316340922828933, "grad_norm": 10.68380355834961, "learning_rate": 5e-05, "loss": 2.0556, "step": 4 }, { "epoch": 0.020632681845657867, "grad_norm": 13.343311309814453, "learning_rate": 0.0001, "loss": 1.9915, "step": 8 }, { "epoch": 0.020632681845657867, "eval_loss": 2.0385255813598633, "eval_runtime": 251.2319, "eval_samples_per_second": 0.979, "eval_steps_per_second": 0.979, "step": 8 }, { "epoch": 0.0309490227684868, "grad_norm": 19.941059112548828, "learning_rate": 9.997251843068762e-05, "loss": 2.0389, "step": 12 }, { "epoch": 0.04126536369131573, "grad_norm": 13.265397071838379, "learning_rate": 9.989010393221656e-05, "loss": 2.054, "step": 16 }, { "epoch": 0.04126536369131573, "eval_loss": 2.037567615509033, "eval_runtime": 356.5096, "eval_samples_per_second": 0.69, "eval_steps_per_second": 0.69, "step": 16 }, { "epoch": 0.05158170461414467, "grad_norm": 12.110239028930664, "learning_rate": 9.97528470997769e-05, "loss": 1.9974, "step": 20 }, { "epoch": 0.0618980455369736, "grad_norm": 15.224806785583496, "learning_rate": 9.956089881469482e-05, "loss": 2.0356, "step": 24 }, { "epoch": 0.0618980455369736, "eval_loss": 2.060382127761841, "eval_runtime": 260.5723, "eval_samples_per_second": 0.944, "eval_steps_per_second": 0.944, "step": 24 }, { "epoch": 0.07221438645980253, "grad_norm": 15.310386657714844, "learning_rate": 9.931447007857432e-05, "loss": 2.0835, "step": 28 }, { "epoch": 0.08253072738263147, "grad_norm": 14.485530853271484, "learning_rate": 9.901383178135113e-05, "loss": 2.0385, "step": 32 }, { "epoch": 0.08253072738263147, "eval_loss": 2.0639231204986572, "eval_runtime": 347.075, "eval_samples_per_second": 0.709, "eval_steps_per_second": 0.709, "step": 32 }, { "epoch": 0.09284706830546041, "grad_norm": 23.205764770507812, "learning_rate": 9.865931440351337e-05, "loss": 2.0658, "step": 36 }, { "epoch": 0.10316340922828934, "grad_norm": 20.585407257080078, "learning_rate": 9.825130765281668e-05, "loss": 2.1223, "step": 40 }, { "epoch": 0.10316340922828934, "eval_loss": 2.083315849304199, "eval_runtime": 244.456, "eval_samples_per_second": 1.006, "eval_steps_per_second": 1.006, "step": 40 }, { "epoch": 0.11347975015111827, "grad_norm": 23.029006958007812, "learning_rate": 9.779026003589304e-05, "loss": 2.0669, "step": 44 }, { "epoch": 0.1237960910739472, "grad_norm": 27.581724166870117, "learning_rate": 9.727667836522407e-05, "loss": 2.0677, "step": 48 }, { "epoch": 0.1237960910739472, "eval_loss": 2.0909621715545654, "eval_runtime": 258.3491, "eval_samples_per_second": 0.952, "eval_steps_per_second": 0.952, "step": 48 }, { "epoch": 0.13411243199677614, "grad_norm": 20.641569137573242, "learning_rate": 9.6711127202021e-05, "loss": 2.118, "step": 52 }, { "epoch": 0.14442877291960507, "grad_norm": 15.354959487915039, "learning_rate": 9.609422823562345e-05, "loss": 2.0729, "step": 56 }, { "epoch": 0.14442877291960507, "eval_loss": 2.0872063636779785, "eval_runtime": 366.7103, "eval_samples_per_second": 0.671, "eval_steps_per_second": 0.671, "step": 56 }, { "epoch": 0.154745113842434, "grad_norm": 13.397254943847656, 
"learning_rate": 9.542665960009959e-05, "loss": 2.0806, "step": 60 }, { "epoch": 0.16506145476526293, "grad_norm": 14.432162284851074, "learning_rate": 9.470915512879852e-05, "loss": 2.1197, "step": 64 }, { "epoch": 0.16506145476526293, "eval_loss": 2.0972707271575928, "eval_runtime": 241.6226, "eval_samples_per_second": 1.018, "eval_steps_per_second": 1.018, "step": 64 }, { "epoch": 0.1753777956880919, "grad_norm": 16.219799041748047, "learning_rate": 9.394250354767467e-05, "loss": 2.1002, "step": 68 }, { "epoch": 0.18569413661092082, "grad_norm": 19.845632553100586, "learning_rate": 9.312754760827061e-05, "loss": 2.1053, "step": 72 }, { "epoch": 0.18569413661092082, "eval_loss": 2.091938018798828, "eval_runtime": 374.7664, "eval_samples_per_second": 0.656, "eval_steps_per_second": 0.656, "step": 72 }, { "epoch": 0.19601047753374976, "grad_norm": 19.960819244384766, "learning_rate": 9.226518316131176e-05, "loss": 2.1111, "step": 76 }, { "epoch": 0.2063268184565787, "grad_norm": 22.90218734741211, "learning_rate": 9.1356358171931e-05, "loss": 2.0848, "step": 80 }, { "epoch": 0.2063268184565787, "eval_loss": 2.1034836769104004, "eval_runtime": 250.3178, "eval_samples_per_second": 0.983, "eval_steps_per_second": 0.983, "step": 80 }, { "epoch": 0.21664315937940762, "grad_norm": 20.233325958251953, "learning_rate": 9.040207167760586e-05, "loss": 2.0845, "step": 84 }, { "epoch": 0.22695950030223655, "grad_norm": 18.64703941345215, "learning_rate": 8.940337268995385e-05, "loss": 2.1015, "step": 88 }, { "epoch": 0.22695950030223655, "eval_loss": 2.1114280223846436, "eval_runtime": 378.994, "eval_samples_per_second": 0.649, "eval_steps_per_second": 0.649, "step": 88 }, { "epoch": 0.23727584122506548, "grad_norm": 12.925492286682129, "learning_rate": 8.836135904159302e-05, "loss": 2.1163, "step": 92 }, { "epoch": 0.2475921821478944, "grad_norm": 17.775686264038086, "learning_rate": 8.727717617933544e-05, "loss": 2.0872, "step": 96 }, { "epoch": 0.2475921821478944, "eval_loss": 2.113255023956299, "eval_runtime": 253.955, "eval_samples_per_second": 0.969, "eval_steps_per_second": 0.969, "step": 96 }, { "epoch": 0.25790852307072337, "grad_norm": 13.843049049377441, "learning_rate": 8.615201590504017e-05, "loss": 2.1374, "step": 100 }, { "epoch": 0.2682248639935523, "grad_norm": 17.85464859008789, "learning_rate": 8.498711506550983e-05, "loss": 2.0948, "step": 104 }, { "epoch": 0.2682248639935523, "eval_loss": 2.1220598220825195, "eval_runtime": 250.0959, "eval_samples_per_second": 0.984, "eval_steps_per_second": 0.984, "step": 104 }, { "epoch": 0.27854120491638124, "grad_norm": 14.508893966674805, "learning_rate": 8.378375419287099e-05, "loss": 2.1539, "step": 108 }, { "epoch": 0.28885754583921014, "grad_norm": 14.467598915100098, "learning_rate": 8.25432560969328e-05, "loss": 2.097, "step": 112 }, { "epoch": 0.28885754583921014, "eval_loss": 2.1218836307525635, "eval_runtime": 366.7235, "eval_samples_per_second": 0.671, "eval_steps_per_second": 0.671, "step": 112 }, { "epoch": 0.2991738867620391, "grad_norm": 25.095067977905273, "learning_rate": 8.126698441107146e-05, "loss": 2.1127, "step": 116 }, { "epoch": 0.309490227684868, "grad_norm": 16.594051361083984, "learning_rate": 7.995634209323886e-05, "loss": 2.147, "step": 120 }, { "epoch": 0.309490227684868, "eval_loss": 2.1240272521972656, "eval_runtime": 255.4843, "eval_samples_per_second": 0.963, "eval_steps_per_second": 0.963, "step": 120 }, { "epoch": 0.31980656860769696, "grad_norm": 14.561055183410645, "learning_rate": 7.861276988374302e-05, 
"loss": 2.1397, "step": 124 }, { "epoch": 0.33012290953052587, "grad_norm": 16.39865493774414, "learning_rate": 7.723774472149601e-05, "loss": 2.1315, "step": 128 }, { "epoch": 0.33012290953052587, "eval_loss": 2.1188597679138184, "eval_runtime": 389.6053, "eval_samples_per_second": 0.631, "eval_steps_per_second": 0.631, "step": 128 }, { "epoch": 0.3404392504533548, "grad_norm": 15.670085906982422, "learning_rate": 7.583277812046993e-05, "loss": 2.1466, "step": 132 }, { "epoch": 0.3507555913761838, "grad_norm": 22.689796447753906, "learning_rate": 7.439941450814591e-05, "loss": 2.1563, "step": 136 }, { "epoch": 0.3507555913761838, "eval_loss": 2.136768341064453, "eval_runtime": 252.2156, "eval_samples_per_second": 0.975, "eval_steps_per_second": 0.975, "step": 136 }, { "epoch": 0.3610719322990127, "grad_norm": 18.863473892211914, "learning_rate": 7.293922952778239e-05, "loss": 2.1586, "step": 140 }, { "epoch": 0.37138827322184165, "grad_norm": 17.896759033203125, "learning_rate": 7.145382830636924e-05, "loss": 2.1836, "step": 144 }, { "epoch": 0.37138827322184165, "eval_loss": 2.127108335494995, "eval_runtime": 364.7213, "eval_samples_per_second": 0.674, "eval_steps_per_second": 0.674, "step": 144 }, { "epoch": 0.38170461414467055, "grad_norm": 21.21641731262207, "learning_rate": 6.994484369017143e-05, "loss": 2.1371, "step": 148 }, { "epoch": 0.3920209550674995, "grad_norm": 15.671196937561035, "learning_rate": 6.841393444980177e-05, "loss": 2.1245, "step": 152 }, { "epoch": 0.3920209550674995, "eval_loss": 2.1197926998138428, "eval_runtime": 395.7933, "eval_samples_per_second": 0.622, "eval_steps_per_second": 0.622, "step": 152 }, { "epoch": 0.4023372959903284, "grad_norm": 16.22295379638672, "learning_rate": 6.686278345679625e-05, "loss": 2.1008, "step": 156 }, { "epoch": 0.4126536369131574, "grad_norm": 14.019622802734375, "learning_rate": 6.529309583369605e-05, "loss": 2.0947, "step": 160 }, { "epoch": 0.4126536369131574, "eval_loss": 2.123974561691284, "eval_runtime": 257.3823, "eval_samples_per_second": 0.956, "eval_steps_per_second": 0.956, "step": 160 }, { "epoch": 0.4229699778359863, "grad_norm": 18.07195281982422, "learning_rate": 6.370659707966967e-05, "loss": 2.1187, "step": 164 }, { "epoch": 0.43328631875881524, "grad_norm": 19.1149959564209, "learning_rate": 6.2105031173736e-05, "loss": 2.1472, "step": 168 }, { "epoch": 0.43328631875881524, "eval_loss": 2.13541579246521, "eval_runtime": 358.8473, "eval_samples_per_second": 0.686, "eval_steps_per_second": 0.686, "step": 168 }, { "epoch": 0.44360265968164414, "grad_norm": 17.229467391967773, "learning_rate": 6.049015865767318e-05, "loss": 2.0992, "step": 172 }, { "epoch": 0.4539190006044731, "grad_norm": 14.356927871704102, "learning_rate": 5.88637547007204e-05, "loss": 2.1348, "step": 176 }, { "epoch": 0.4539190006044731, "eval_loss": 2.1260793209075928, "eval_runtime": 248.3974, "eval_samples_per_second": 0.99, "eval_steps_per_second": 0.99, "step": 176 }, { "epoch": 0.46423534152730206, "grad_norm": 16.18288803100586, "learning_rate": 5.722760714820057e-05, "loss": 2.1451, "step": 180 }, { "epoch": 0.47455168245013096, "grad_norm": 16.129526138305664, "learning_rate": 5.5583514556208514e-05, "loss": 2.1099, "step": 184 }, { "epoch": 0.47455168245013096, "eval_loss": 2.127509355545044, "eval_runtime": 357.9954, "eval_samples_per_second": 0.687, "eval_steps_per_second": 0.687, "step": 184 }, { "epoch": 0.4848680233729599, "grad_norm": 17.3072509765625, "learning_rate": 5.393328421452514e-05, "loss": 2.108, "step": 188 }, { 
"epoch": 0.4951843642957888, "grad_norm": 15.436331748962402, "learning_rate": 5.2278730159931076e-05, "loss": 2.1006, "step": 192 }, { "epoch": 0.4951843642957888, "eval_loss": 2.1196277141571045, "eval_runtime": 266.5248, "eval_samples_per_second": 0.923, "eval_steps_per_second": 0.923, "step": 192 }, { "epoch": 0.5055007052186178, "grad_norm": 13.889219284057617, "learning_rate": 5.062167118210367e-05, "loss": 2.1588, "step": 196 }, { "epoch": 0.5158170461414467, "grad_norm": 15.053701400756836, "learning_rate": 4.896392882428901e-05, "loss": 2.1339, "step": 200 }, { "epoch": 0.5158170461414467, "eval_loss": 2.1169850826263428, "eval_runtime": 243.5045, "eval_samples_per_second": 1.01, "eval_steps_per_second": 1.01, "step": 200 }, { "epoch": 0.5261333870642756, "grad_norm": 15.929015159606934, "learning_rate": 4.730732538094749e-05, "loss": 2.1116, "step": 204 }, { "epoch": 0.5364497279871046, "grad_norm": 13.288613319396973, "learning_rate": 4.565368189457313e-05, "loss": 2.0841, "step": 208 }, { "epoch": 0.5364497279871046, "eval_loss": 2.1104753017425537, "eval_runtime": 365.3009, "eval_samples_per_second": 0.673, "eval_steps_per_second": 0.673, "step": 208 }, { "epoch": 0.5467660689099335, "grad_norm": 15.282868385314941, "learning_rate": 4.400481615388948e-05, "loss": 2.0709, "step": 212 }, { "epoch": 0.5570824098327625, "grad_norm": 16.86932945251465, "learning_rate": 4.236254069562213e-05, "loss": 2.1344, "step": 216 }, { "epoch": 0.5570824098327625, "eval_loss": 2.1079087257385254, "eval_runtime": 251.0859, "eval_samples_per_second": 0.98, "eval_steps_per_second": 0.98, "step": 216 }, { "epoch": 0.5673987507555914, "grad_norm": 15.028175354003906, "learning_rate": 4.0728660812044536e-05, "loss": 2.1397, "step": 220 }, { "epoch": 0.5777150916784203, "grad_norm": 14.224735260009766, "learning_rate": 3.910497256648742e-05, "loss": 2.0732, "step": 224 }, { "epoch": 0.5777150916784203, "eval_loss": 2.104301929473877, "eval_runtime": 366.8962, "eval_samples_per_second": 0.67, "eval_steps_per_second": 0.67, "step": 224 }, { "epoch": 0.5880314326012492, "grad_norm": 15.107568740844727, "learning_rate": 3.749326081899329e-05, "loss": 2.1003, "step": 228 }, { "epoch": 0.5983477735240782, "grad_norm": 12.49576473236084, "learning_rate": 3.589529726428615e-05, "loss": 2.0417, "step": 232 }, { "epoch": 0.5983477735240782, "eval_loss": 2.103517532348633, "eval_runtime": 243.1722, "eval_samples_per_second": 1.012, "eval_steps_per_second": 1.012, "step": 232 }, { "epoch": 0.6086641144469072, "grad_norm": 10.86776065826416, "learning_rate": 3.431283848421347e-05, "loss": 2.1209, "step": 236 }, { "epoch": 0.618980455369736, "grad_norm": 15.035432815551758, "learning_rate": 3.274762401680124e-05, "loss": 2.1003, "step": 240 }, { "epoch": 0.618980455369736, "eval_loss": 2.0966618061065674, "eval_runtime": 354.4771, "eval_samples_per_second": 0.694, "eval_steps_per_second": 0.694, "step": 240 }, { "epoch": 0.629296796292565, "grad_norm": 19.571407318115234, "learning_rate": 3.120137444404442e-05, "loss": 2.0631, "step": 244 }, { "epoch": 0.6396131372153939, "grad_norm": 12.437437057495117, "learning_rate": 2.9675789500535328e-05, "loss": 2.0501, "step": 248 }, { "epoch": 0.6396131372153939, "eval_loss": 2.1006603240966797, "eval_runtime": 242.766, "eval_samples_per_second": 1.013, "eval_steps_per_second": 1.013, "step": 248 }, { "epoch": 0.6499294781382229, "grad_norm": 14.765650749206543, "learning_rate": 2.8172546205008683e-05, "loss": 2.1215, "step": 252 }, { "epoch": 0.6602458190610517, 
"grad_norm": 12.97276496887207, "learning_rate": 2.6693297016857188e-05, "loss": 2.078, "step": 256 }, { "epoch": 0.6602458190610517, "eval_loss": 2.0861904621124268, "eval_runtime": 251.7783, "eval_samples_per_second": 0.977, "eval_steps_per_second": 0.977, "step": 256 }, { "epoch": 0.6705621599838807, "grad_norm": 13.271310806274414, "learning_rate": 2.523966801964468e-05, "loss": 2.0733, "step": 260 }, { "epoch": 0.6808785009067096, "grad_norm": 13.07889461517334, "learning_rate": 2.3813257133612827e-05, "loss": 2.0507, "step": 264 }, { "epoch": 0.6808785009067096, "eval_loss": 2.084022045135498, "eval_runtime": 351.4005, "eval_samples_per_second": 0.7, "eval_steps_per_second": 0.7, "step": 264 }, { "epoch": 0.6911948418295386, "grad_norm": 12.636960983276367, "learning_rate": 2.2415632359146856e-05, "loss": 2.0855, "step": 268 }, { "epoch": 0.7015111827523676, "grad_norm": 16.52528190612793, "learning_rate": 2.104833005313131e-05, "loss": 2.0235, "step": 272 }, { "epoch": 0.7015111827523676, "eval_loss": 2.076244354248047, "eval_runtime": 240.3826, "eval_samples_per_second": 1.023, "eval_steps_per_second": 1.023, "step": 272 }, { "epoch": 0.7118275236751964, "grad_norm": 11.037178039550781, "learning_rate": 1.971285324008994e-05, "loss": 2.1384, "step": 276 }, { "epoch": 0.7221438645980254, "grad_norm": 13.414417266845703, "learning_rate": 1.84106699599668e-05, "loss": 2.0743, "step": 280 }, { "epoch": 0.7221438645980254, "eval_loss": 2.072314500808716, "eval_runtime": 366.894, "eval_samples_per_second": 0.67, "eval_steps_per_second": 0.67, "step": 280 }, { "epoch": 0.7324602055208543, "grad_norm": 12.51091480255127, "learning_rate": 1.7143211654364762e-05, "loss": 2.0863, "step": 284 }, { "epoch": 0.7427765464436833, "grad_norm": 15.719615936279297, "learning_rate": 1.5911871593014837e-05, "loss": 2.1028, "step": 288 }, { "epoch": 0.7427765464436833, "eval_loss": 2.0720996856689453, "eval_runtime": 240.5658, "eval_samples_per_second": 1.023, "eval_steps_per_second": 1.023, "step": 288 }, { "epoch": 0.7530928873665121, "grad_norm": 11.165517807006836, "learning_rate": 1.4718003342206722e-05, "loss": 2.0551, "step": 292 }, { "epoch": 0.7634092282893411, "grad_norm": 13.10633659362793, "learning_rate": 1.3562919276863844e-05, "loss": 2.0987, "step": 296 }, { "epoch": 0.7634092282893411, "eval_loss": 2.066223382949829, "eval_runtime": 248.9943, "eval_samples_per_second": 0.988, "eval_steps_per_second": 0.988, "step": 296 }, { "epoch": 0.7737255692121701, "grad_norm": 15.164176940917969, "learning_rate": 1.2447889137898293e-05, "loss": 2.113, "step": 300 }, { "epoch": 0.784041910134999, "grad_norm": 13.534417152404785, "learning_rate": 1.1374138636432053e-05, "loss": 2.0985, "step": 304 }, { "epoch": 0.784041910134999, "eval_loss": 2.066317558288574, "eval_runtime": 373.4925, "eval_samples_per_second": 0.659, "eval_steps_per_second": 0.659, "step": 304 }, { "epoch": 0.794358251057828, "grad_norm": 11.72275161743164, "learning_rate": 1.0342848106418368e-05, "loss": 2.0898, "step": 308 }, { "epoch": 0.8046745919806568, "grad_norm": 12.691418647766113, "learning_rate": 9.35515120714447e-06, "loss": 2.0548, "step": 312 }, { "epoch": 0.8046745919806568, "eval_loss": 2.0601634979248047, "eval_runtime": 252.2425, "eval_samples_per_second": 0.975, "eval_steps_per_second": 0.975, "step": 312 }, { "epoch": 0.8149909329034858, "grad_norm": 9.275248527526855, "learning_rate": 8.41213367704224e-06, "loss": 2.0907, "step": 316 }, { "epoch": 0.8253072738263147, "grad_norm": 13.456718444824219, 
"learning_rate": 7.51483214017637e-06, "loss": 2.0365, "step": 320 }, { "epoch": 0.8253072738263147, "eval_loss": 2.0563478469848633, "eval_runtime": 378.3713, "eval_samples_per_second": 0.65, "eval_steps_per_second": 0.65, "step": 320 }, { "epoch": 0.8356236147491437, "grad_norm": 10.312747955322266, "learning_rate": 6.664232966721995e-06, "loss": 2.0537, "step": 324 }, { "epoch": 0.8459399556719726, "grad_norm": 11.122403144836426, "learning_rate": 5.8612711886848196e-06, "loss": 2.0102, "step": 328 }, { "epoch": 0.8459399556719726, "eval_loss": 2.056385040283203, "eval_runtime": 247.0489, "eval_samples_per_second": 0.996, "eval_steps_per_second": 0.996, "step": 328 }, { "epoch": 0.8562562965948015, "grad_norm": 12.165043830871582, "learning_rate": 5.106829472055202e-06, "loss": 2.0219, "step": 332 }, { "epoch": 0.8665726375176305, "grad_norm": 14.259943008422852, "learning_rate": 4.401737146526219e-06, "loss": 2.0497, "step": 336 }, { "epoch": 0.8665726375176305, "eval_loss": 2.0522236824035645, "eval_runtime": 366.3478, "eval_samples_per_second": 0.671, "eval_steps_per_second": 0.671, "step": 336 }, { "epoch": 0.8768889784404594, "grad_norm": 12.8046293258667, "learning_rate": 3.7467692938425057e-06, "loss": 2.0136, "step": 340 }, { "epoch": 0.8872053193632883, "grad_norm": 15.552796363830566, "learning_rate": 3.142645895781715e-06, "loss": 2.0721, "step": 344 }, { "epoch": 0.8872053193632883, "eval_loss": 2.0470504760742188, "eval_runtime": 245.8972, "eval_samples_per_second": 1.0, "eval_steps_per_second": 1.0, "step": 344 }, { "epoch": 0.8975216602861172, "grad_norm": 10.629472732543945, "learning_rate": 2.5900310427053044e-06, "loss": 2.0459, "step": 348 }, { "epoch": 0.9078380012089462, "grad_norm": 12.79671859741211, "learning_rate": 2.089532203548794e-06, "loss": 2.0812, "step": 352 }, { "epoch": 0.9078380012089462, "eval_loss": 2.0468461513519287, "eval_runtime": 246.3251, "eval_samples_per_second": 0.999, "eval_steps_per_second": 0.999, "step": 352 }, { "epoch": 0.9181543421317752, "grad_norm": 10.172734260559082, "learning_rate": 1.6416995580537664e-06, "loss": 2.0588, "step": 356 }, { "epoch": 0.9284706830546041, "grad_norm": 13.063249588012695, "learning_rate": 1.247025391975698e-06, "loss": 2.0475, "step": 360 }, { "epoch": 0.9284706830546041, "eval_loss": 2.0462241172790527, "eval_runtime": 383.7451, "eval_samples_per_second": 0.641, "eval_steps_per_second": 0.641, "step": 360 }, { "epoch": 0.938787023977433, "grad_norm": 9.2144193649292, "learning_rate": 9.059435559326257e-07, "loss": 2.0669, "step": 364 }, { "epoch": 0.9491033649002619, "grad_norm": 11.035449028015137, "learning_rate": 6.188289884893062e-07, "loss": 2.0687, "step": 368 }, { "epoch": 0.9491033649002619, "eval_loss": 2.045191526412964, "eval_runtime": 250.014, "eval_samples_per_second": 0.984, "eval_steps_per_second": 0.984, "step": 368 }, { "epoch": 0.9594197058230909, "grad_norm": 11.38818073272705, "learning_rate": 3.8599730400115107e-07, "loss": 2.0007, "step": 372 }, { "epoch": 0.9697360467459198, "grad_norm": 11.021515846252441, "learning_rate": 2.0770444567118075e-07, "loss": 2.065, "step": 376 }, { "epoch": 0.9697360467459198, "eval_loss": 2.044952392578125, "eval_runtime": 364.9331, "eval_samples_per_second": 0.674, "eval_steps_per_second": 0.674, "step": 376 }, { "epoch": 0.9800523876687487, "grad_norm": 14.076632499694824, "learning_rate": 8.414640420116305e-08, "loss": 2.0618, "step": 380 }, { "epoch": 0.9903687285915777, "grad_norm": 15.729289054870605, "learning_rate": 1.5459002346324135e-08, 
"loss": 1.991, "step": 384 }, { "epoch": 0.9903687285915777, "eval_loss": 2.04483962059021, "eval_runtime": 253.6132, "eval_samples_per_second": 0.97, "eval_steps_per_second": 0.97, "step": 384 } ], "logging_steps": 4, "max_steps": 387, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 8, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.5374998322774016e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }