{ "best_metric": 0.9388719201087952, "best_model_checkpoint": "/home/datta0/models/lora_final/Qwen2-7B_magiccoder_ortho/checkpoint-12", "epoch": 0.99836867862969, "eval_steps": 4, "global_step": 153, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0065252854812398045, "grad_norm": 4.3915696144104, "learning_rate": 7.5e-05, "loss": 0.8061, "step": 1 }, { "epoch": 0.013050570962479609, "grad_norm": 3.7475483417510986, "learning_rate": 0.00015, "loss": 0.931, "step": 2 }, { "epoch": 0.026101141924959218, "grad_norm": 5.367969989776611, "learning_rate": 0.0003, "loss": 0.8262, "step": 4 }, { "epoch": 0.026101141924959218, "eval_loss": 1.0883674621582031, "eval_runtime": 24.5353, "eval_samples_per_second": 19.931, "eval_steps_per_second": 2.527, "step": 4 }, { "epoch": 0.03915171288743882, "grad_norm": 11.019035339355469, "learning_rate": 0.00029986665273697545, "loss": 1.0003, "step": 6 }, { "epoch": 0.052202283849918436, "grad_norm": 2.1015965938568115, "learning_rate": 0.0002994668480344693, "loss": 0.9776, "step": 8 }, { "epoch": 0.052202283849918436, "eval_loss": 0.9662861227989197, "eval_runtime": 24.6827, "eval_samples_per_second": 19.811, "eval_steps_per_second": 2.512, "step": 8 }, { "epoch": 0.06525285481239804, "grad_norm": 1.750746726989746, "learning_rate": 0.0002988012967306524, "loss": 0.9319, "step": 10 }, { "epoch": 0.07830342577487764, "grad_norm": 2.2127320766448975, "learning_rate": 0.000297871182151455, "loss": 0.9345, "step": 12 }, { "epoch": 0.07830342577487764, "eval_loss": 0.9388719201087952, "eval_runtime": 24.6723, "eval_samples_per_second": 19.82, "eval_steps_per_second": 2.513, "step": 12 }, { "epoch": 0.09135399673735727, "grad_norm": 1.6279808282852173, "learning_rate": 0.00029667815800665635, "loss": 0.9489, "step": 14 }, { "epoch": 0.10440456769983687, "grad_norm": 1.5321425199508667, "learning_rate": 0.0002952243454496488, "loss": 0.9026, "step": 16 }, { "epoch": 0.10440456769983687, "eval_loss": 0.9481999278068542, "eval_runtime": 24.6231, "eval_samples_per_second": 19.859, "eval_steps_per_second": 2.518, "step": 16 }, { "epoch": 0.11745513866231648, "grad_norm": 1.355367660522461, "learning_rate": 0.0002935123293061047, "loss": 0.9004, "step": 18 }, { "epoch": 0.13050570962479607, "grad_norm": 1.5347142219543457, "learning_rate": 0.0002915451534782506, "loss": 0.9618, "step": 20 }, { "epoch": 0.13050570962479607, "eval_loss": 0.957125186920166, "eval_runtime": 24.5628, "eval_samples_per_second": 19.908, "eval_steps_per_second": 2.524, "step": 20 }, { "epoch": 0.14355628058727568, "grad_norm": 1.5545670986175537, "learning_rate": 0.0002893263155329204, "loss": 0.9457, "step": 22 }, { "epoch": 0.1566068515497553, "grad_norm": 1.4520829916000366, "learning_rate": 0.00028685976048300875, "loss": 0.8685, "step": 24 }, { "epoch": 0.1566068515497553, "eval_loss": 0.9719114899635315, "eval_runtime": 24.4752, "eval_samples_per_second": 19.979, "eval_steps_per_second": 2.533, "step": 24 }, { "epoch": 0.16965742251223492, "grad_norm": 1.3440358638763428, "learning_rate": 0.00028414987377338235, "loss": 1.0084, "step": 26 }, { "epoch": 0.18270799347471453, "grad_norm": 1.43779718875885, "learning_rate": 0.0002812014734837191, "loss": 0.8834, "step": 28 }, { "epoch": 0.18270799347471453, "eval_loss": 0.9751714468002319, "eval_runtime": 56.0537, "eval_samples_per_second": 8.724, "eval_steps_per_second": 1.106, "step": 28 }, { "epoch": 0.19575856443719414, "grad_norm": 1.4336111545562744, "learning_rate": 0.0002780198017621379, "loss": 0.9617, "step": 30 }, { "epoch": 0.20880913539967375, "grad_norm": 1.291685700416565, "learning_rate": 0.00027461051550485116, "loss": 1.0185, "step": 32 }, { "epoch": 0.20880913539967375, "eval_loss": 0.987638533115387, "eval_runtime": 59.7835, "eval_samples_per_second": 8.18, "eval_steps_per_second": 1.037, "step": 32 }, { "epoch": 0.22185970636215335, "grad_norm": 1.3673038482666016, "learning_rate": 0.00027097967629840906, "loss": 0.9289, "step": 34 }, { "epoch": 0.23491027732463296, "grad_norm": 1.321115255355835, "learning_rate": 0.0002671337396424204, "loss": 0.9354, "step": 36 }, { "epoch": 0.23491027732463296, "eval_loss": 0.9922739863395691, "eval_runtime": 58.5264, "eval_samples_per_second": 8.355, "eval_steps_per_second": 1.059, "step": 36 }, { "epoch": 0.24796084828711257, "grad_norm": 1.3209092617034912, "learning_rate": 0.00026307954347190983, "loss": 0.9003, "step": 38 }, { "epoch": 0.26101141924959215, "grad_norm": 1.2740062475204468, "learning_rate": 0.00025882429599971866, "loss": 0.9734, "step": 40 }, { "epoch": 0.26101141924959215, "eval_loss": 0.9982444047927856, "eval_runtime": 56.4488, "eval_samples_per_second": 8.663, "eval_steps_per_second": 1.098, "step": 40 }, { "epoch": 0.2740619902120718, "grad_norm": 1.3060563802719116, "learning_rate": 0.0002543755629005657, "loss": 0.9583, "step": 42 }, { "epoch": 0.28711256117455136, "grad_norm": 1.2693545818328857, "learning_rate": 0.0002497412538595537, "loss": 1.034, "step": 44 }, { "epoch": 0.28711256117455136, "eval_loss": 1.0034517049789429, "eval_runtime": 57.2987, "eval_samples_per_second": 8.534, "eval_steps_per_second": 1.082, "step": 44 }, { "epoch": 0.300163132137031, "grad_norm": 1.3035016059875488, "learning_rate": 0.00024492960850903755, "loss": 0.9648, "step": 46 }, { "epoch": 0.3132137030995106, "grad_norm": 1.4393730163574219, "learning_rate": 0.00023994918177885902, "loss": 1.0067, "step": 48 }, { "epoch": 0.3132137030995106, "eval_loss": 1.0048160552978516, "eval_runtime": 56.3331, "eval_samples_per_second": 8.681, "eval_steps_per_second": 1.101, "step": 48 }, { "epoch": 0.3262642740619902, "grad_norm": 1.5344454050064087, "learning_rate": 0.0002348088286859938, "loss": 1.0498, "step": 50 }, { "epoch": 0.33931484502446985, "grad_norm": 1.2956377267837524, "learning_rate": 0.00022951768859065402, "loss": 0.932, "step": 52 }, { "epoch": 0.33931484502446985, "eval_loss": 1.00808584690094, "eval_runtime": 56.5841, "eval_samples_per_second": 8.642, "eval_steps_per_second": 1.096, "step": 52 }, { "epoch": 0.3523654159869494, "grad_norm": 1.5058661699295044, "learning_rate": 0.0002240851689468395, "loss": 0.9455, "step": 54 }, { "epoch": 0.36541598694942906, "grad_norm": 1.3148020505905151, "learning_rate": 0.00021852092857622808, "loss": 0.9407, "step": 56 }, { "epoch": 0.36541598694942906, "eval_loss": 1.006118655204773, "eval_runtime": 57.5921, "eval_samples_per_second": 8.491, "eval_steps_per_second": 1.077, "step": 56 }, { "epoch": 0.37846655791190864, "grad_norm": 1.1989065408706665, "learning_rate": 0.00021283486049514277, "loss": 1.023, "step": 58 }, { "epoch": 0.3915171288743883, "grad_norm": 1.3775067329406738, "learning_rate": 0.00020703707432513004, "loss": 0.9682, "step": 60 }, { "epoch": 0.3915171288743883, "eval_loss": 1.0053811073303223, "eval_runtime": 57.2201, "eval_samples_per_second": 8.546, "eval_steps_per_second": 1.084, "step": 60 }, { "epoch": 0.40456769983686786, "grad_norm": 1.320212960243225, "learning_rate": 0.00020113787831842152, "loss": 0.8986, "step": 62 }, { "epoch": 0.4176182707993475, "grad_norm": 1.325500726699829, "learning_rate": 0.0001951477610302378, "loss": 1.0224, "step": 64 }, { "epoch": 0.4176182707993475, "eval_loss": 1.0092753171920776, "eval_runtime": 56.0548, "eval_samples_per_second": 8.724, "eval_steps_per_second": 1.106, "step": 64 }, { "epoch": 0.43066884176182707, "grad_norm": 1.2880396842956543, "learning_rate": 0.0001890773726705198, "loss": 0.9943, "step": 66 }, { "epoch": 0.4437194127243067, "grad_norm": 1.237645149230957, "learning_rate": 0.00018293750616824443, "loss": 1.0145, "step": 68 }, { "epoch": 0.4437194127243067, "eval_loss": 1.009407639503479, "eval_runtime": 24.7188, "eval_samples_per_second": 19.783, "eval_steps_per_second": 2.508, "step": 68 }, { "epoch": 0.4567699836867863, "grad_norm": 1.1990931034088135, "learning_rate": 0.00017673907798199052, "loss": 1.0333, "step": 70 }, { "epoch": 0.4698205546492659, "grad_norm": 1.2862218618392944, "learning_rate": 0.000170493108690874, "loss": 0.9756, "step": 72 }, { "epoch": 0.4698205546492659, "eval_loss": 1.010068416595459, "eval_runtime": 24.7249, "eval_samples_per_second": 19.778, "eval_steps_per_second": 2.508, "step": 72 }, { "epoch": 0.4828711256117455, "grad_norm": 1.2775288820266724, "learning_rate": 0.00016421070340036023, "loss": 1.0124, "step": 74 }, { "epoch": 0.49592169657422513, "grad_norm": 1.3676966428756714, "learning_rate": 0.00015790303199779193, "loss": 0.9968, "step": 76 }, { "epoch": 0.49592169657422513, "eval_loss": 1.0086660385131836, "eval_runtime": 24.6773, "eval_samples_per_second": 19.816, "eval_steps_per_second": 2.512, "step": 76 }, { "epoch": 0.5089722675367048, "grad_norm": 1.2739876508712769, "learning_rate": 0.00015158130929273695, "loss": 0.9405, "step": 78 }, { "epoch": 0.5220228384991843, "grad_norm": 1.3879481554031372, "learning_rate": 0.00014525677507746615, "loss": 0.9566, "step": 80 }, { "epoch": 0.5220228384991843, "eval_loss": 1.0094032287597656, "eval_runtime": 24.6744, "eval_samples_per_second": 19.818, "eval_steps_per_second": 2.513, "step": 80 }, { "epoch": 0.5350734094616639, "grad_norm": 1.246418833732605, "learning_rate": 0.00013894067414301314, "loss": 1.0481, "step": 82 }, { "epoch": 0.5481239804241436, "grad_norm": 1.3928742408752441, "learning_rate": 0.0001326442362863458, "loss": 1.0394, "step": 84 }, { "epoch": 0.5481239804241436, "eval_loss": 1.008681297302246, "eval_runtime": 24.6024, "eval_samples_per_second": 19.876, "eval_steps_per_second": 2.52, "step": 84 }, { "epoch": 0.5611745513866232, "grad_norm": 1.2170292139053345, "learning_rate": 0.00012637865634419735, "loss": 0.9979, "step": 86 }, { "epoch": 0.5742251223491027, "grad_norm": 1.3591171503067017, "learning_rate": 0.00012015507428905507, "loss": 0.9546, "step": 88 }, { "epoch": 0.5742251223491027, "eval_loss": 1.0074015855789185, "eval_runtime": 24.5002, "eval_samples_per_second": 19.959, "eval_steps_per_second": 2.531, "step": 88 }, { "epoch": 0.5872756933115824, "grad_norm": 1.336329460144043, "learning_rate": 0.00011398455542269575, "loss": 0.9125, "step": 90 }, { "epoch": 0.600326264274062, "grad_norm": 1.2378321886062622, "learning_rate": 0.00010787807070248305, "loss": 1.0347, "step": 92 }, { "epoch": 0.600326264274062, "eval_loss": 1.0086424350738525, "eval_runtime": 24.4221, "eval_samples_per_second": 20.023, "eval_steps_per_second": 2.539, "step": 92 }, { "epoch": 0.6133768352365416, "grad_norm": 1.3458659648895264, "learning_rate": 0.00010184647723540557, "loss": 0.9567, "step": 94 }, { "epoch": 0.6264274061990212, "grad_norm": 1.251621961593628, "learning_rate": 9.590049897453668e-05, "loss": 0.9639, "step": 96 }, { "epoch": 0.6264274061990212, "eval_loss": 1.004166841506958, "eval_runtime": 56.7542, "eval_samples_per_second": 8.616, "eval_steps_per_second": 1.092, "step": 96 }, { "epoch": 0.6394779771615008, "grad_norm": 1.15924870967865, "learning_rate": 9.005070765223768e-05, "loss": 1.0447, "step": 98 }, { "epoch": 0.6525285481239804, "grad_norm": 1.4097235202789307, "learning_rate": 8.430750398400308e-05, "loss": 1.0543, "step": 100 }, { "epoch": 0.6525285481239804, "eval_loss": 1.002665638923645, "eval_runtime": 55.6845, "eval_samples_per_second": 8.782, "eval_steps_per_second": 1.113, "step": 100 }, { "epoch": 0.6655791190864601, "grad_norm": 1.3108314275741577, "learning_rate": 7.868109917636821e-05, "loss": 0.9645, "step": 102 }, { "epoch": 0.6786296900489397, "grad_norm": 1.2921593189239502, "learning_rate": 7.318149677175675e-05, "loss": 0.9346, "step": 104 }, { "epoch": 0.6786296900489397, "eval_loss": 1.003048300743103, "eval_runtime": 57.4498, "eval_samples_per_second": 8.512, "eval_steps_per_second": 1.079, "step": 104 }, { "epoch": 0.6916802610114192, "grad_norm": 1.2615606784820557, "learning_rate": 6.781847486254697e-05, "loss": 0.9565, "step": 106 }, { "epoch": 0.7047308319738989, "grad_norm": 1.3441969156265259, "learning_rate": 6.260156870598071e-05, "loss": 0.9744, "step": 108 }, { "epoch": 0.7047308319738989, "eval_loss": 1.0019466876983643, "eval_runtime": 56.5017, "eval_samples_per_second": 8.655, "eval_steps_per_second": 1.097, "step": 108 }, { "epoch": 0.7177814029363785, "grad_norm": 1.1984766721725464, "learning_rate": 5.7540053770823644e-05, "loss": 0.9558, "step": 110 }, { "epoch": 0.7308319738988581, "grad_norm": 1.259084701538086, "learning_rate": 5.264292924592073e-05, "loss": 0.9546, "step": 112 }, { "epoch": 0.7308319738988581, "eval_loss": 0.9984883069992065, "eval_runtime": 56.938, "eval_samples_per_second": 8.588, "eval_steps_per_second": 1.089, "step": 112 }, { "epoch": 0.7438825448613376, "grad_norm": 1.2619880437850952, "learning_rate": 4.791890203996634e-05, "loss": 0.9784, "step": 114 }, { "epoch": 0.7569331158238173, "grad_norm": 1.1600760221481323, "learning_rate": 4.3376371300938786e-05, "loss": 0.9138, "step": 116 }, { "epoch": 0.7569331158238173, "eval_loss": 0.9968593716621399, "eval_runtime": 56.3466, "eval_samples_per_second": 8.678, "eval_steps_per_second": 1.1, "step": 116 }, { "epoch": 0.7699836867862969, "grad_norm": 1.2669286727905273, "learning_rate": 3.9023413482721426e-05, "loss": 0.9714, "step": 118 }, { "epoch": 0.7830342577487766, "grad_norm": 1.2815442085266113, "learning_rate": 3.4867767985462507e-05, "loss": 0.9026, "step": 120 }, { "epoch": 0.7830342577487766, "eval_loss": 0.9961332082748413, "eval_runtime": 56.6715, "eval_samples_per_second": 8.629, "eval_steps_per_second": 1.094, "step": 120 }, { "epoch": 0.7960848287112561, "grad_norm": 1.2086176872253418, "learning_rate": 3.09168233952042e-05, "loss": 1.0291, "step": 122 }, { "epoch": 0.8091353996737357, "grad_norm": 1.2728592157363892, "learning_rate": 2.717760434724613e-05, "loss": 0.9746, "step": 124 }, { "epoch": 0.8091353996737357, "eval_loss": 0.9953013062477112, "eval_runtime": 56.695, "eval_samples_per_second": 8.625, "eval_steps_per_second": 1.094, "step": 124 }, { "epoch": 0.8221859706362153, "grad_norm": 1.163971185684204, "learning_rate": 2.3656759036600187e-05, "loss": 0.9733, "step": 126 }, { "epoch": 0.835236541598695, "grad_norm": 1.2905343770980835, "learning_rate": 2.0360547397742523e-05, "loss": 0.9453, "step": 128 }, { "epoch": 0.835236541598695, "eval_loss": 0.9950230717658997, "eval_runtime": 57.4352, "eval_samples_per_second": 8.514, "eval_steps_per_second": 1.079, "step": 128 }, { "epoch": 0.8482871125611745, "grad_norm": 1.2126384973526, "learning_rate": 1.7294829974678338e-05, "loss": 0.922, "step": 130 }, { "epoch": 0.8613376835236541, "grad_norm": 1.3399946689605713, "learning_rate": 1.4465057501108546e-05, "loss": 1.0311, "step": 132 }, { "epoch": 0.8613376835236541, "eval_loss": 0.9933781027793884, "eval_runtime": 56.8077, "eval_samples_per_second": 8.608, "eval_steps_per_second": 1.091, "step": 132 }, { "epoch": 0.8743882544861338, "grad_norm": 1.2741433382034302, "learning_rate": 1.1876261209224314e-05, "loss": 0.9365, "step": 134 }, { "epoch": 0.8874388254486134, "grad_norm": 1.1750285625457764, "learning_rate": 9.533043884359615e-06, "loss": 0.971, "step": 136 }, { "epoch": 0.8874388254486134, "eval_loss": 0.992695152759552, "eval_runtime": 24.7252, "eval_samples_per_second": 19.777, "eval_steps_per_second": 2.508, "step": 136 }, { "epoch": 0.9004893964110929, "grad_norm": 1.1639913320541382, "learning_rate": 7.439571681407053e-06, "loss": 1.0128, "step": 138 }, { "epoch": 0.9135399673735726, "grad_norm": 1.2708672285079956, "learning_rate": 5.59956671754635e-06, "loss": 0.9957, "step": 140 }, { "epoch": 0.9135399673735726, "eval_loss": 0.9919000864028931, "eval_runtime": 24.7098, "eval_samples_per_second": 19.79, "eval_steps_per_second": 2.509, "step": 140 }, { "epoch": 0.9265905383360522, "grad_norm": 1.3160277605056763, "learning_rate": 4.016300454455945e-06, "loss": 1.0054, "step": 142 }, { "epoch": 0.9396411092985318, "grad_norm": 1.325445532798767, "learning_rate": 2.692587881773478e-06, "loss": 0.9502, "step": 144 }, { "epoch": 0.9396411092985318, "eval_loss": 0.9917099475860596, "eval_runtime": 24.7029, "eval_samples_per_second": 19.795, "eval_steps_per_second": 2.51, "step": 144 }, { "epoch": 0.9526916802610114, "grad_norm": 1.1836706399917603, "learning_rate": 1.6307825121469164e-06, "loss": 0.991, "step": 146 }, { "epoch": 0.965742251223491, "grad_norm": 1.2473053932189941, "learning_rate": 8.327721967749779e-07, "loss": 1.0133, "step": 148 }, { "epoch": 0.965742251223491, "eval_loss": 0.9915127158164978, "eval_runtime": 24.6188, "eval_samples_per_second": 19.863, "eval_steps_per_second": 2.518, "step": 148 }, { "epoch": 0.9787928221859706, "grad_norm": 1.237483024597168, "learning_rate": 2.9997576887660913e-07, "loss": 0.9316, "step": 150 }, { "epoch": 0.9918433931484503, "grad_norm": 1.279980182647705, "learning_rate": 3.334052105728458e-08, "loss": 0.9684, "step": 152 }, { "epoch": 0.9918433931484503, "eval_loss": 0.9916173219680786, "eval_runtime": 24.5672, "eval_samples_per_second": 19.905, "eval_steps_per_second": 2.524, "step": 152 } ], "logging_steps": 2, "max_steps": 153, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.85963932651946e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }