{ "best_metric": 0.9220191240310669, "best_model_checkpoint": "/home/datta0/models/lora_final/Qwen2-7B_magiccoder_default/checkpoint-4", "epoch": 0.99836867862969, "eval_steps": 4, "global_step": 153, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0065252854812398045, "grad_norm": 6.940319538116455, "learning_rate": 7.5e-05, "loss": 0.8051, "step": 1 }, { "epoch": 0.013050570962479609, "grad_norm": 5.597632884979248, "learning_rate": 0.00015, "loss": 0.9302, "step": 2 }, { "epoch": 0.026101141924959218, "grad_norm": 4.383840084075928, "learning_rate": 0.0003, "loss": 0.8215, "step": 4 }, { "epoch": 0.026101141924959218, "eval_loss": 0.9220191240310669, "eval_runtime": 24.7526, "eval_samples_per_second": 19.756, "eval_steps_per_second": 2.505, "step": 4 }, { "epoch": 0.03915171288743882, "grad_norm": 3.2939515113830566, "learning_rate": 0.00029986665273697545, "loss": 0.8629, "step": 6 }, { "epoch": 0.052202283849918436, "grad_norm": 49.209835052490234, "learning_rate": 0.0002994668480344693, "loss": 0.9247, "step": 8 }, { "epoch": 0.052202283849918436, "eval_loss": 0.9779874086380005, "eval_runtime": 24.7429, "eval_samples_per_second": 19.763, "eval_steps_per_second": 2.506, "step": 8 }, { "epoch": 0.06525285481239804, "grad_norm": 2.9049675464630127, "learning_rate": 0.0002988012967306524, "loss": 0.9425, "step": 10 }, { "epoch": 0.07830342577487764, "grad_norm": 2.8929295539855957, "learning_rate": 0.000297871182151455, "loss": 0.9611, "step": 12 }, { "epoch": 0.07830342577487764, "eval_loss": 0.9693307876586914, "eval_runtime": 24.7094, "eval_samples_per_second": 19.79, "eval_steps_per_second": 2.509, "step": 12 }, { "epoch": 0.09135399673735727, "grad_norm": 2.5417490005493164, "learning_rate": 0.00029667815800665635, "loss": 0.9792, "step": 14 }, { "epoch": 0.10440456769983687, "grad_norm": 2.707855224609375, "learning_rate": 0.0002952243454496488, "loss": 0.9392, "step": 16 }, { "epoch": 0.10440456769983687, "eval_loss": 0.9866985082626343, "eval_runtime": 24.6246, "eval_samples_per_second": 19.858, "eval_steps_per_second": 2.518, "step": 16 }, { "epoch": 0.11745513866231648, "grad_norm": 7.373922348022461, "learning_rate": 0.0002935123293061047, "loss": 0.9393, "step": 18 }, { "epoch": 0.13050570962479607, "grad_norm": 2.5633223056793213, "learning_rate": 0.0002915451534782506, "loss": 1.0135, "step": 20 }, { "epoch": 0.13050570962479607, "eval_loss": 1.0108040571212769, "eval_runtime": 24.6129, "eval_samples_per_second": 19.868, "eval_steps_per_second": 2.519, "step": 20 }, { "epoch": 0.14355628058727568, "grad_norm": 2.2816696166992188, "learning_rate": 0.0002893263155329204, "loss": 1.0003, "step": 22 }, { "epoch": 0.1566068515497553, "grad_norm": 2.6352310180664062, "learning_rate": 0.00028685976048300875, "loss": 0.9152, "step": 24 }, { "epoch": 0.1566068515497553, "eval_loss": 1.0166871547698975, "eval_runtime": 24.4896, "eval_samples_per_second": 19.968, "eval_steps_per_second": 2.532, "step": 24 }, { "epoch": 0.16965742251223492, "grad_norm": 2.428823232650757, "learning_rate": 0.00028414987377338235, "loss": 1.0468, "step": 26 }, { "epoch": 0.18270799347471453, "grad_norm": 2.590581178665161, "learning_rate": 0.0002812014734837191, "loss": 0.9298, "step": 28 }, { "epoch": 0.18270799347471453, "eval_loss": 1.0250943899154663, "eval_runtime": 54.7083, "eval_samples_per_second": 8.938, "eval_steps_per_second": 1.133, "step": 28 }, { "epoch": 0.19575856443719414, "grad_norm": 2.5896878242492676, "learning_rate": 0.0002780198017621379, "loss": 1.0095, "step": 30 }, { "epoch": 0.20880913539967375, "grad_norm": 2.394001007080078, "learning_rate": 0.00027461051550485116, "loss": 1.0625, "step": 32 }, { "epoch": 0.20880913539967375, "eval_loss": 1.0349429845809937, "eval_runtime": 54.8183, "eval_samples_per_second": 8.92, "eval_steps_per_second": 1.131, "step": 32 }, { "epoch": 0.22185970636215335, "grad_norm": 2.3402562141418457, "learning_rate": 0.00027097967629840906, "loss": 0.9817, "step": 34 }, { "epoch": 0.23491027732463296, "grad_norm": 2.0935347080230713, "learning_rate": 0.0002671337396424204, "loss": 0.9695, "step": 36 }, { "epoch": 0.23491027732463296, "eval_loss": 1.0332014560699463, "eval_runtime": 55.2086, "eval_samples_per_second": 8.857, "eval_steps_per_second": 1.123, "step": 36 }, { "epoch": 0.24796084828711257, "grad_norm": 1.9977389574050903, "learning_rate": 0.00026307954347190983, "loss": 0.9429, "step": 38 }, { "epoch": 0.26101141924959215, "grad_norm": 2.104321241378784, "learning_rate": 0.00025882429599971866, "loss": 1.0104, "step": 40 }, { "epoch": 0.26101141924959215, "eval_loss": 1.0390156507492065, "eval_runtime": 55.2819, "eval_samples_per_second": 8.846, "eval_steps_per_second": 1.122, "step": 40 }, { "epoch": 0.2740619902120718, "grad_norm": 1.943311095237732, "learning_rate": 0.0002543755629005657, "loss": 0.9952, "step": 42 }, { "epoch": 0.28711256117455136, "grad_norm": 2.2244155406951904, "learning_rate": 0.0002497412538595537, "loss": 1.0721, "step": 44 }, { "epoch": 0.28711256117455136, "eval_loss": 1.0405514240264893, "eval_runtime": 55.1226, "eval_samples_per_second": 8.871, "eval_steps_per_second": 1.125, "step": 44 }, { "epoch": 0.300163132137031, "grad_norm": 2.213677406311035, "learning_rate": 0.00024492960850903755, "loss": 0.9997, "step": 46 }, { "epoch": 0.3132137030995106, "grad_norm": 2.108431100845337, "learning_rate": 0.00023994918177885902, "loss": 1.0397, "step": 48 }, { "epoch": 0.3132137030995106, "eval_loss": 1.0448977947235107, "eval_runtime": 55.076, "eval_samples_per_second": 8.879, "eval_steps_per_second": 1.126, "step": 48 }, { "epoch": 0.3262642740619902, "grad_norm": 2.3814570903778076, "learning_rate": 0.0002348088286859938, "loss": 1.0839, "step": 50 }, { "epoch": 0.33931484502446985, "grad_norm": 2.261181116104126, "learning_rate": 0.00022951768859065402, "loss": 0.9623, "step": 52 }, { "epoch": 0.33931484502446985, "eval_loss": 1.0447765588760376, "eval_runtime": 55.341, "eval_samples_per_second": 8.836, "eval_steps_per_second": 1.12, "step": 52 }, { "epoch": 0.3523654159869494, "grad_norm": 2.159951686859131, "learning_rate": 0.0002240851689468395, "loss": 0.9753, "step": 54 }, { "epoch": 0.36541598694942906, "grad_norm": 2.21645188331604, "learning_rate": 0.00021852092857622808, "loss": 0.9735, "step": 56 }, { "epoch": 0.36541598694942906, "eval_loss": 1.0435727834701538, "eval_runtime": 54.9591, "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.128, "step": 56 }, { "epoch": 0.37846655791190864, "grad_norm": 2.0440618991851807, "learning_rate": 0.00021283486049514277, "loss": 1.051, "step": 58 }, { "epoch": 0.3915171288743883, "grad_norm": 2.2410900592803955, "learning_rate": 0.00020703707432513004, "loss": 1.0016, "step": 60 }, { "epoch": 0.3915171288743883, "eval_loss": 1.043070673942566, "eval_runtime": 55.0676, "eval_samples_per_second": 8.88, "eval_steps_per_second": 1.126, "step": 60 }, { "epoch": 0.40456769983686786, "grad_norm": 2.09979248046875, "learning_rate": 0.00020113787831842152, "loss": 0.9375, "step": 62 }, { "epoch": 0.4176182707993475, "grad_norm": 2.2402355670928955, "learning_rate": 0.0001951477610302378, "loss": 1.0557, "step": 64 }, { "epoch": 0.4176182707993475, "eval_loss": 1.0401309728622437, "eval_runtime": 55.6078, "eval_samples_per_second": 8.794, "eval_steps_per_second": 1.115, "step": 64 }, { "epoch": 0.43066884176182707, "grad_norm": 2.0318586826324463, "learning_rate": 0.0001890773726705198, "loss": 1.0214, "step": 66 }, { "epoch": 0.4437194127243067, "grad_norm": 2.138606309890747, "learning_rate": 0.00018293750616824443, "loss": 1.0377, "step": 68 }, { "epoch": 0.4437194127243067, "eval_loss": 1.0372790098190308, "eval_runtime": 24.7257, "eval_samples_per_second": 19.777, "eval_steps_per_second": 2.508, "step": 68 }, { "epoch": 0.4567699836867863, "grad_norm": 1.7595700025558472, "learning_rate": 0.00017673907798199052, "loss": 1.0546, "step": 70 }, { "epoch": 0.4698205546492659, "grad_norm": 1.987815022468567, "learning_rate": 0.000170493108690874, "loss": 1.0022, "step": 72 }, { "epoch": 0.4698205546492659, "eval_loss": 1.0360997915267944, "eval_runtime": 24.7474, "eval_samples_per_second": 19.76, "eval_steps_per_second": 2.505, "step": 72 }, { "epoch": 0.4828711256117455, "grad_norm": 1.9406994581222534, "learning_rate": 0.00016421070340036023, "loss": 1.0372, "step": 74 }, { "epoch": 0.49592169657422513, "grad_norm": 2.0799319744110107, "learning_rate": 0.00015790303199779193, "loss": 1.0193, "step": 76 }, { "epoch": 0.49592169657422513, "eval_loss": 1.0328214168548584, "eval_runtime": 24.7131, "eval_samples_per_second": 19.787, "eval_steps_per_second": 2.509, "step": 76 }, { "epoch": 0.5089722675367048, "grad_norm": 2.057676315307617, "learning_rate": 0.00015158130929273695, "loss": 0.9597, "step": 78 }, { "epoch": 0.5220228384991843, "grad_norm": 2.00854754447937, "learning_rate": 0.00014525677507746615, "loss": 0.9806, "step": 80 }, { "epoch": 0.5220228384991843, "eval_loss": 1.0301356315612793, "eval_runtime": 24.7052, "eval_samples_per_second": 19.793, "eval_steps_per_second": 2.51, "step": 80 }, { "epoch": 0.5350734094616639, "grad_norm": 1.9844895601272583, "learning_rate": 0.00013894067414301314, "loss": 1.068, "step": 82 }, { "epoch": 0.5481239804241436, "grad_norm": 1.9492027759552002, "learning_rate": 0.0001326442362863458, "loss": 1.0542, "step": 84 }, { "epoch": 0.5481239804241436, "eval_loss": 1.0262655019760132, "eval_runtime": 24.6275, "eval_samples_per_second": 19.856, "eval_steps_per_second": 2.518, "step": 84 }, { "epoch": 0.5611745513866232, "grad_norm": 1.8868807554244995, "learning_rate": 0.00012637865634419735, "loss": 1.0136, "step": 86 }, { "epoch": 0.5742251223491027, "grad_norm": 1.9024137258529663, "learning_rate": 0.00012015507428905507, "loss": 0.9692, "step": 88 }, { "epoch": 0.5742251223491027, "eval_loss": 1.024366021156311, "eval_runtime": 24.5294, "eval_samples_per_second": 19.935, "eval_steps_per_second": 2.528, "step": 88 }, { "epoch": 0.5872756933115824, "grad_norm": 2.2372443675994873, "learning_rate": 0.00011398455542269575, "loss": 0.9305, "step": 90 }, { "epoch": 0.600326264274062, "grad_norm": 1.8708783388137817, "learning_rate": 0.00010787807070248305, "loss": 1.0464, "step": 92 }, { "epoch": 0.600326264274062, "eval_loss": 1.0215392112731934, "eval_runtime": 24.4525, "eval_samples_per_second": 19.998, "eval_steps_per_second": 2.536, "step": 92 }, { "epoch": 0.6133768352365416, "grad_norm": 2.0300116539001465, "learning_rate": 0.00010184647723540557, "loss": 0.9709, "step": 94 }, { "epoch": 0.6264274061990212, "grad_norm": 2.0198493003845215, "learning_rate": 9.590049897453668e-05, "loss": 0.9771, "step": 96 }, { "epoch": 0.6264274061990212, "eval_loss": 1.01658034324646, "eval_runtime": 53.8396, "eval_samples_per_second": 9.083, "eval_steps_per_second": 1.152, "step": 96 }, { "epoch": 0.6394779771615008, "grad_norm": 1.8200911283493042, "learning_rate": 9.005070765223768e-05, "loss": 1.0565, "step": 98 }, { "epoch": 0.6525285481239804, "grad_norm": 2.173635721206665, "learning_rate": 8.430750398400308e-05, "loss": 1.0659, "step": 100 }, { "epoch": 0.6525285481239804, "eval_loss": 1.0145906209945679, "eval_runtime": 55.4651, "eval_samples_per_second": 8.816, "eval_steps_per_second": 1.118, "step": 100 }, { "epoch": 0.6655791190864601, "grad_norm": 1.9142309427261353, "learning_rate": 7.868109917636821e-05, "loss": 0.9761, "step": 102 }, { "epoch": 0.6786296900489397, "grad_norm": 1.9679898023605347, "learning_rate": 7.318149677175675e-05, "loss": 0.9476, "step": 104 }, { "epoch": 0.6786296900489397, "eval_loss": 1.0106278657913208, "eval_runtime": 55.6719, "eval_samples_per_second": 8.784, "eval_steps_per_second": 1.114, "step": 104 }, { "epoch": 0.6916802610114192, "grad_norm": 1.9258702993392944, "learning_rate": 6.781847486254697e-05, "loss": 0.963, "step": 106 }, { "epoch": 0.7047308319738989, "grad_norm": 2.029904842376709, "learning_rate": 6.260156870598071e-05, "loss": 0.983, "step": 108 }, { "epoch": 0.7047308319738989, "eval_loss": 1.0074269771575928, "eval_runtime": 57.0045, "eval_samples_per_second": 8.578, "eval_steps_per_second": 1.088, "step": 108 }, { "epoch": 0.7177814029363785, "grad_norm": 1.779940128326416, "learning_rate": 5.7540053770823644e-05, "loss": 0.9698, "step": 110 }, { "epoch": 0.7308319738988581, "grad_norm": 2.0144851207733154, "learning_rate": 5.264292924592073e-05, "loss": 0.9585, "step": 112 }, { "epoch": 0.7308319738988581, "eval_loss": 1.0034711360931396, "eval_runtime": 57.5133, "eval_samples_per_second": 8.502, "eval_steps_per_second": 1.078, "step": 112 }, { "epoch": 0.7438825448613376, "grad_norm": 1.9726147651672363, "learning_rate": 4.791890203996634e-05, "loss": 0.9865, "step": 114 }, { "epoch": 0.7569331158238173, "grad_norm": 1.7042125463485718, "learning_rate": 4.3376371300938786e-05, "loss": 0.9193, "step": 116 }, { "epoch": 0.7569331158238173, "eval_loss": 0.9996815323829651, "eval_runtime": 57.7466, "eval_samples_per_second": 8.468, "eval_steps_per_second": 1.074, "step": 116 }, { "epoch": 0.7699836867862969, "grad_norm": 1.8329825401306152, "learning_rate": 3.9023413482721426e-05, "loss": 0.9742, "step": 118 }, { "epoch": 0.7830342577487766, "grad_norm": 1.861943006515503, "learning_rate": 3.4867767985462507e-05, "loss": 0.9041, "step": 120 }, { "epoch": 0.7830342577487766, "eval_loss": 0.9974753260612488, "eval_runtime": 55.1354, "eval_samples_per_second": 8.869, "eval_steps_per_second": 1.125, "step": 120 }, { "epoch": 0.7960848287112561, "grad_norm": 1.8173584938049316, "learning_rate": 3.09168233952042e-05, "loss": 1.026, "step": 122 }, { "epoch": 0.8091353996737357, "grad_norm": 1.79753839969635, "learning_rate": 2.717760434724613e-05, "loss": 0.9697, "step": 124 }, { "epoch": 0.8091353996737357, "eval_loss": 0.9954367876052856, "eval_runtime": 55.5077, "eval_samples_per_second": 8.81, "eval_steps_per_second": 1.117, "step": 124 }, { "epoch": 0.8221859706362153, "grad_norm": 1.7292028665542603, "learning_rate": 2.3656759036600187e-05, "loss": 0.9747, "step": 126 }, { "epoch": 0.835236541598695, "grad_norm": 1.9664617776870728, "learning_rate": 2.0360547397742523e-05, "loss": 0.9464, "step": 128 }, { "epoch": 0.835236541598695, "eval_loss": 0.9932743906974792, "eval_runtime": 57.4493, "eval_samples_per_second": 8.512, "eval_steps_per_second": 1.079, "step": 128 }, { "epoch": 0.8482871125611745, "grad_norm": 1.82283616065979, "learning_rate": 1.7294829974678338e-05, "loss": 0.9256, "step": 130 }, { "epoch": 0.8613376835236541, "grad_norm": 1.9917670488357544, "learning_rate": 1.4465057501108546e-05, "loss": 1.0252, "step": 132 }, { "epoch": 0.8613376835236541, "eval_loss": 0.9916940927505493, "eval_runtime": 55.6062, "eval_samples_per_second": 8.794, "eval_steps_per_second": 1.115, "step": 132 }, { "epoch": 0.8743882544861338, "grad_norm": 1.8504716157913208, "learning_rate": 1.1876261209224314e-05, "loss": 0.9374, "step": 134 }, { "epoch": 0.8874388254486134, "grad_norm": 1.6590113639831543, "learning_rate": 9.533043884359615e-06, "loss": 0.9665, "step": 136 }, { "epoch": 0.8874388254486134, "eval_loss": 0.9909241199493408, "eval_runtime": 24.7544, "eval_samples_per_second": 19.754, "eval_steps_per_second": 2.505, "step": 136 }, { "epoch": 0.9004893964110929, "grad_norm": 1.7258245944976807, "learning_rate": 7.439571681407053e-06, "loss": 1.0069, "step": 138 }, { "epoch": 0.9135399673735726, "grad_norm": 1.87185537815094, "learning_rate": 5.59956671754635e-06, "loss": 0.9948, "step": 140 }, { "epoch": 0.9135399673735726, "eval_loss": 0.9903515577316284, "eval_runtime": 24.7378, "eval_samples_per_second": 19.767, "eval_steps_per_second": 2.506, "step": 140 }, { "epoch": 0.9265905383360522, "grad_norm": 1.9415644407272339, "learning_rate": 4.016300454455945e-06, "loss": 1.0008, "step": 142 }, { "epoch": 0.9396411092985318, "grad_norm": 1.9181973934173584, "learning_rate": 2.692587881773478e-06, "loss": 0.946, "step": 144 }, { "epoch": 0.9396411092985318, "eval_loss": 0.9896851778030396, "eval_runtime": 24.7297, "eval_samples_per_second": 19.774, "eval_steps_per_second": 2.507, "step": 144 }, { "epoch": 0.9526916802610114, "grad_norm": 1.8300237655639648, "learning_rate": 1.6307825121469164e-06, "loss": 0.9866, "step": 146 }, { "epoch": 0.965742251223491, "grad_norm": 1.893951177597046, "learning_rate": 8.327721967749779e-07, "loss": 1.0095, "step": 148 }, { "epoch": 0.965742251223491, "eval_loss": 0.9895658493041992, "eval_runtime": 24.6594, "eval_samples_per_second": 19.83, "eval_steps_per_second": 2.514, "step": 148 }, { "epoch": 0.9787928221859706, "grad_norm": 1.895480990409851, "learning_rate": 2.9997576887660913e-07, "loss": 0.9295, "step": 150 }, { "epoch": 0.9918433931484503, "grad_norm": 1.8694380521774292, "learning_rate": 3.334052105728458e-08, "loss": 0.9675, "step": 152 }, { "epoch": 0.9918433931484503, "eval_loss": 0.9894064664840698, "eval_runtime": 24.5775, "eval_samples_per_second": 19.896, "eval_steps_per_second": 2.523, "step": 152 } ], "logging_steps": 2, "max_steps": 153, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.85963932651946e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }