{ "best_metric": 1.2696720361709595, "best_model_checkpoint": "/home/datta0/models/lora_final/Meta-Llama-3-8B_magiccoder_default/checkpoint-152", "epoch": 0.9975708502024292, "eval_steps": 4, "global_step": 154, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006477732793522267, "grad_norm": 7.975759983062744, "learning_rate": 7.5e-05, "loss": 1.3889, "step": 1 }, { "epoch": 0.012955465587044534, "grad_norm": 8.645442008972168, "learning_rate": 0.00015, "loss": 1.5051, "step": 2 }, { "epoch": 0.025910931174089068, "grad_norm": 8.133455276489258, "learning_rate": 0.0003, "loss": 1.2592, "step": 4 }, { "epoch": 0.025910931174089068, "eval_loss": 1.4263103008270264, "eval_runtime": 26.5522, "eval_samples_per_second": 18.643, "eval_steps_per_second": 2.335, "step": 4 }, { "epoch": 0.038866396761133605, "grad_norm": 15.554954528808594, "learning_rate": 0.00029986842451482874, "loss": 1.4659, "step": 6 }, { "epoch": 0.051821862348178135, "grad_norm": 11.2620849609375, "learning_rate": 0.0002994739288874256, "loss": 1.4281, "step": 8 }, { "epoch": 0.051821862348178135, "eval_loss": 1.4063184261322021, "eval_runtime": 26.5276, "eval_samples_per_second": 18.66, "eval_steps_per_second": 2.337, "step": 8 }, { "epoch": 0.06477732793522267, "grad_norm": 7.065455913543701, "learning_rate": 0.0002988172051971717, "loss": 1.3404, "step": 10 }, { "epoch": 0.07773279352226721, "grad_norm": 9.748723030090332, "learning_rate": 0.0002978994055605757, "loss": 1.3795, "step": 12 }, { "epoch": 0.07773279352226721, "eval_loss": 1.382373332977295, "eval_runtime": 26.5171, "eval_samples_per_second": 18.667, "eval_steps_per_second": 2.338, "step": 12 }, { "epoch": 0.09068825910931175, "grad_norm": 12.217620849609375, "learning_rate": 0.0002967221401100708, "loss": 1.4444, "step": 14 }, { "epoch": 0.10364372469635627, "grad_norm": 8.278894424438477, "learning_rate": 0.00029528747416929463, "loss": 1.3751, "step": 16 }, { "epoch": 0.10364372469635627, "eval_loss": 1.3937216997146606, "eval_runtime": 26.4559, "eval_samples_per_second": 18.71, "eval_steps_per_second": 2.344, "step": 16 }, { "epoch": 0.11659919028340081, "grad_norm": 14.924971580505371, "learning_rate": 0.00029359792462981004, "loss": 1.3393, "step": 18 }, { "epoch": 0.12955465587044535, "grad_norm": 7.198514461517334, "learning_rate": 0.00029165645553562214, "loss": 1.4053, "step": 20 }, { "epoch": 0.12955465587044535, "eval_loss": 1.3523073196411133, "eval_runtime": 26.4173, "eval_samples_per_second": 18.738, "eval_steps_per_second": 2.347, "step": 20 }, { "epoch": 0.14251012145748987, "grad_norm": 6.342569828033447, "learning_rate": 0.00028946647288323766, "loss": 1.3715, "step": 22 }, { "epoch": 0.15546558704453442, "grad_norm": 5.877941131591797, "learning_rate": 0.0002870318186463901, "loss": 1.2927, "step": 24 }, { "epoch": 0.15546558704453442, "eval_loss": 1.3473957777023315, "eval_runtime": 26.2462, "eval_samples_per_second": 18.86, "eval_steps_per_second": 2.362, "step": 24 }, { "epoch": 0.16842105263157894, "grad_norm": 4.236348628997803, "learning_rate": 0.0002843567640359119, "loss": 1.2866, "step": 26 }, { "epoch": 0.1813765182186235, "grad_norm": 8.370125770568848, "learning_rate": 0.0002814460020065795, "loss": 1.3619, "step": 28 }, { "epoch": 0.1813765182186235, "eval_loss": 1.3529084920883179, "eval_runtime": 58.5577, "eval_samples_per_second": 8.453, "eval_steps_per_second": 1.059, "step": 28 }, { "epoch": 0.19433198380566802, "grad_norm": 5.792184352874756, "learning_rate": 0.000278304639024076, "loss": 1.3633, "step": 30 }, { "epoch": 0.20728744939271254, "grad_norm": 12.429760932922363, "learning_rate": 0.00027493818610651487, "loss": 1.3533, "step": 32 }, { "epoch": 0.20728744939271254, "eval_loss": 1.3628934621810913, "eval_runtime": 58.1834, "eval_samples_per_second": 8.508, "eval_steps_per_second": 1.066, "step": 32 }, { "epoch": 0.2202429149797571, "grad_norm": 8.379545211791992, "learning_rate": 0.0002713525491562421, "loss": 1.4074, "step": 34 }, { "epoch": 0.23319838056680162, "grad_norm": 10.356365203857422, "learning_rate": 0.00026755401859887595, "loss": 1.3627, "step": 36 }, { "epoch": 0.23319838056680162, "eval_loss": 1.3635836839675903, "eval_runtime": 58.1828, "eval_samples_per_second": 8.508, "eval_steps_per_second": 1.066, "step": 36 }, { "epoch": 0.24615384615384617, "grad_norm": 7.12795352935791, "learning_rate": 0.00026354925834776345, "loss": 1.2767, "step": 38 }, { "epoch": 0.2591093117408907, "grad_norm": 7.771018981933594, "learning_rate": 0.0002593452941132117, "loss": 1.4408, "step": 40 }, { "epoch": 0.2591093117408907, "eval_loss": 1.3530882596969604, "eval_runtime": 58.4466, "eval_samples_per_second": 8.469, "eval_steps_per_second": 1.061, "step": 40 }, { "epoch": 0.2720647773279352, "grad_norm": 4.957456588745117, "learning_rate": 0.0002549495010770048, "loss": 1.3767, "step": 42 }, { "epoch": 0.28502024291497974, "grad_norm": 5.10299825668335, "learning_rate": 0.0002503695909538287, "loss": 1.3744, "step": 44 }, { "epoch": 0.28502024291497974, "eval_loss": 1.3395272493362427, "eval_runtime": 58.5125, "eval_samples_per_second": 8.46, "eval_steps_per_second": 1.06, "step": 44 }, { "epoch": 0.2979757085020243, "grad_norm": 5.048585891723633, "learning_rate": 0.0002456135984623034, "loss": 1.3295, "step": 46 }, { "epoch": 0.31093117408906884, "grad_norm": 3.550816059112549, "learning_rate": 0.00024068986722935624, "loss": 1.2658, "step": 48 }, { "epoch": 0.31093117408906884, "eval_loss": 1.3364324569702148, "eval_runtime": 58.5182, "eval_samples_per_second": 8.459, "eval_steps_per_second": 1.06, "step": 48 }, { "epoch": 0.32388663967611336, "grad_norm": 9.427572250366211, "learning_rate": 0.00023560703515266478, "loss": 1.383, "step": 50 }, { "epoch": 0.3368421052631579, "grad_norm": 6.952998638153076, "learning_rate": 0.00023037401924684946, "loss": 1.3364, "step": 52 }, { "epoch": 0.3368421052631579, "eval_loss": 1.340024709701538, "eval_runtime": 58.6238, "eval_samples_per_second": 8.444, "eval_steps_per_second": 1.058, "step": 52 }, { "epoch": 0.3497975708502024, "grad_norm": 3.620708465576172, "learning_rate": 0.000225, "loss": 1.299, "step": 54 }, { "epoch": 0.362753036437247, "grad_norm": 5.74739408493042, "learning_rate": 0.00021949440526797926, "loss": 1.3765, "step": 56 }, { "epoch": 0.362753036437247, "eval_loss": 1.3390815258026123, "eval_runtime": 58.1389, "eval_samples_per_second": 8.514, "eval_steps_per_second": 1.066, "step": 56 }, { "epoch": 0.3757085020242915, "grad_norm": 3.6690967082977295, "learning_rate": 0.00021386689373476087, "loss": 1.2833, "step": 58 }, { "epoch": 0.38866396761133604, "grad_norm": 4.156944274902344, "learning_rate": 0.00020812733796781542, "loss": 1.3427, "step": 60 }, { "epoch": 0.38866396761133604, "eval_loss": 1.3370171785354614, "eval_runtime": 58.6352, "eval_samples_per_second": 8.442, "eval_steps_per_second": 1.057, "step": 60 }, { "epoch": 0.40161943319838056, "grad_norm": 4.592350959777832, "learning_rate": 0.00020228580709827227, "loss": 1.3145, "step": 62 }, { "epoch": 0.4145748987854251, "grad_norm": 3.865915060043335, "learning_rate": 0.0001963525491562421, "loss": 1.3975, "step": 64 }, { "epoch": 0.4145748987854251, "eval_loss": 1.332935094833374, "eval_runtime": 58.8373, "eval_samples_per_second": 8.413, "eval_steps_per_second": 1.054, "step": 64 }, { "epoch": 0.42753036437246966, "grad_norm": 3.824434757232666, "learning_rate": 0.00019033797309228983, "loss": 1.3048, "step": 66 }, { "epoch": 0.4404858299595142, "grad_norm": 4.484032154083252, "learning_rate": 0.00018425263051659836, "loss": 1.2595, "step": 68 }, { "epoch": 0.4404858299595142, "eval_loss": 1.3324816226959229, "eval_runtime": 26.5744, "eval_samples_per_second": 18.627, "eval_steps_per_second": 2.333, "step": 68 }, { "epoch": 0.4534412955465587, "grad_norm": 5.382876873016357, "learning_rate": 0.0001781071971878587, "loss": 1.3215, "step": 70 }, { "epoch": 0.46639676113360323, "grad_norm": 5.109270095825195, "learning_rate": 0.00017191245428436173, "loss": 1.3291, "step": 72 }, { "epoch": 0.46639676113360323, "eval_loss": 1.3312028646469116, "eval_runtime": 26.5491, "eval_samples_per_second": 18.645, "eval_steps_per_second": 2.335, "step": 72 }, { "epoch": 0.47935222672064776, "grad_norm": 3.7885613441467285, "learning_rate": 0.000165679269490148, "loss": 1.2995, "step": 74 }, { "epoch": 0.49230769230769234, "grad_norm": 4.146183967590332, "learning_rate": 0.000159418577929397, "loss": 1.2702, "step": 76 }, { "epoch": 0.49230769230769234, "eval_loss": 1.3323029279708862, "eval_runtime": 26.5527, "eval_samples_per_second": 18.642, "eval_steps_per_second": 2.335, "step": 76 }, { "epoch": 0.5052631578947369, "grad_norm": 4.090492248535156, "learning_rate": 0.00015314136298250354, "loss": 1.3247, "step": 78 }, { "epoch": 0.5182186234817814, "grad_norm": 4.541670322418213, "learning_rate": 0.00014685863701749646, "loss": 1.3527, "step": 80 }, { "epoch": 0.5182186234817814, "eval_loss": 1.321340799331665, "eval_runtime": 26.4896, "eval_samples_per_second": 18.687, "eval_steps_per_second": 2.341, "step": 80 }, { "epoch": 0.5311740890688259, "grad_norm": 3.778151750564575, "learning_rate": 0.000140581422070603, "loss": 1.3419, "step": 82 }, { "epoch": 0.5441295546558704, "grad_norm": 3.073720693588257, "learning_rate": 0.000134320730509852, "loss": 1.2799, "step": 84 }, { "epoch": 0.5441295546558704, "eval_loss": 1.3154258728027344, "eval_runtime": 26.4775, "eval_samples_per_second": 18.695, "eval_steps_per_second": 2.342, "step": 84 }, { "epoch": 0.557085020242915, "grad_norm": 3.3625338077545166, "learning_rate": 0.00012808754571563827, "loss": 1.2982, "step": 86 }, { "epoch": 0.5700404858299595, "grad_norm": 3.183030605316162, "learning_rate": 0.00012189280281214126, "loss": 1.3082, "step": 88 }, { "epoch": 0.5700404858299595, "eval_loss": 1.30991792678833, "eval_runtime": 26.3464, "eval_samples_per_second": 18.788, "eval_steps_per_second": 2.353, "step": 88 }, { "epoch": 0.582995951417004, "grad_norm": 3.021327257156372, "learning_rate": 0.00011574736948340163, "loss": 1.2957, "step": 90 }, { "epoch": 0.5959514170040486, "grad_norm": 3.598100423812866, "learning_rate": 0.00010966202690771014, "loss": 1.4042, "step": 92 }, { "epoch": 0.5959514170040486, "eval_loss": 1.3089451789855957, "eval_runtime": 26.2013, "eval_samples_per_second": 18.892, "eval_steps_per_second": 2.366, "step": 92 }, { "epoch": 0.6089068825910932, "grad_norm": 3.076206922531128, "learning_rate": 0.0001036474508437579, "loss": 1.2443, "step": 94 }, { "epoch": 0.6218623481781377, "grad_norm": 3.216519832611084, "learning_rate": 9.771419290172773e-05, "loss": 1.2221, "step": 96 }, { "epoch": 0.6218623481781377, "eval_loss": 1.3048464059829712, "eval_runtime": 58.5931, "eval_samples_per_second": 8.448, "eval_steps_per_second": 1.058, "step": 96 }, { "epoch": 0.6348178137651822, "grad_norm": 3.831418514251709, "learning_rate": 9.187266203218456e-05, "loss": 1.3533, "step": 98 }, { "epoch": 0.6477732793522267, "grad_norm": 3.0627291202545166, "learning_rate": 8.613310626523909e-05, "loss": 1.3079, "step": 100 }, { "epoch": 0.6477732793522267, "eval_loss": 1.3017206192016602, "eval_runtime": 58.4051, "eval_samples_per_second": 8.475, "eval_steps_per_second": 1.062, "step": 100 }, { "epoch": 0.6607287449392713, "grad_norm": 3.7238402366638184, "learning_rate": 8.050559473202077e-05, "loss": 1.3298, "step": 102 }, { "epoch": 0.6736842105263158, "grad_norm": 3.6340792179107666, "learning_rate": 7.500000000000002e-05, "loss": 1.2165, "step": 104 }, { "epoch": 0.6736842105263158, "eval_loss": 1.2970125675201416, "eval_runtime": 58.3679, "eval_samples_per_second": 8.481, "eval_steps_per_second": 1.062, "step": 104 }, { "epoch": 0.6866396761133603, "grad_norm": 3.414825677871704, "learning_rate": 6.962598075315046e-05, "loss": 1.3263, "step": 106 }, { "epoch": 0.6995951417004048, "grad_norm": 3.9352622032165527, "learning_rate": 6.439296484733525e-05, "loss": 1.239, "step": 108 }, { "epoch": 0.6995951417004048, "eval_loss": 1.2941410541534424, "eval_runtime": 58.4064, "eval_samples_per_second": 8.475, "eval_steps_per_second": 1.062, "step": 108 }, { "epoch": 0.7125506072874493, "grad_norm": 2.6827166080474854, "learning_rate": 5.931013277064377e-05, "loss": 1.2469, "step": 110 }, { "epoch": 0.725506072874494, "grad_norm": 3.790194034576416, "learning_rate": 5.4386401537696536e-05, "loss": 1.2528, "step": 112 }, { "epoch": 0.725506072874494, "eval_loss": 1.2877105474472046, "eval_runtime": 58.6006, "eval_samples_per_second": 8.447, "eval_steps_per_second": 1.058, "step": 112 }, { "epoch": 0.7384615384615385, "grad_norm": 3.4227898120880127, "learning_rate": 4.963040904617131e-05, "loss": 1.2605, "step": 114 }, { "epoch": 0.751417004048583, "grad_norm": 3.921224355697632, "learning_rate": 4.5050498922995166e-05, "loss": 1.2932, "step": 116 }, { "epoch": 0.751417004048583, "eval_loss": 1.2858980894088745, "eval_runtime": 58.8103, "eval_samples_per_second": 8.417, "eval_steps_per_second": 1.054, "step": 116 }, { "epoch": 0.7643724696356275, "grad_norm": 3.2218077182769775, "learning_rate": 4.06547058867883e-05, "loss": 1.2063, "step": 118 }, { "epoch": 0.7773279352226721, "grad_norm": 2.803480863571167, "learning_rate": 3.645074165223655e-05, "loss": 1.2762, "step": 120 }, { "epoch": 0.7773279352226721, "eval_loss": 1.2803733348846436, "eval_runtime": 58.4885, "eval_samples_per_second": 8.463, "eval_steps_per_second": 1.06, "step": 120 }, { "epoch": 0.7902834008097166, "grad_norm": 2.5244181156158447, "learning_rate": 3.2445981401124035e-05, "loss": 1.2421, "step": 122 }, { "epoch": 0.8032388663967611, "grad_norm": 3.004887819290161, "learning_rate": 2.8647450843757897e-05, "loss": 1.2914, "step": 124 }, { "epoch": 0.8032388663967611, "eval_loss": 1.2790910005569458, "eval_runtime": 58.6109, "eval_samples_per_second": 8.446, "eval_steps_per_second": 1.058, "step": 124 }, { "epoch": 0.8161943319838056, "grad_norm": 3.6300973892211914, "learning_rate": 2.5061813893485085e-05, "loss": 1.3242, "step": 126 }, { "epoch": 0.8291497975708502, "grad_norm": 2.6950979232788086, "learning_rate": 2.169536097592401e-05, "loss": 1.2835, "step": 128 }, { "epoch": 0.8291497975708502, "eval_loss": 1.2755361795425415, "eval_runtime": 58.9713, "eval_samples_per_second": 8.394, "eval_steps_per_second": 1.051, "step": 128 }, { "epoch": 0.8421052631578947, "grad_norm": 3.0187506675720215, "learning_rate": 1.8553997993420495e-05, "loss": 1.2856, "step": 130 }, { "epoch": 0.8550607287449393, "grad_norm": 2.8952746391296387, "learning_rate": 1.5643235964088064e-05, "loss": 1.2735, "step": 132 }, { "epoch": 0.8550607287449393, "eval_loss": 1.2730661630630493, "eval_runtime": 58.4909, "eval_samples_per_second": 8.463, "eval_steps_per_second": 1.06, "step": 132 }, { "epoch": 0.8680161943319838, "grad_norm": 2.607783555984497, "learning_rate": 1.2968181353609852e-05, "loss": 1.265, "step": 134 }, { "epoch": 0.8809716599190284, "grad_norm": 3.7643980979919434, "learning_rate": 1.0533527116762296e-05, "loss": 1.2264, "step": 136 }, { "epoch": 0.8809716599190284, "eval_loss": 1.272207498550415, "eval_runtime": 26.5562, "eval_samples_per_second": 18.64, "eval_steps_per_second": 2.335, "step": 136 }, { "epoch": 0.8939271255060729, "grad_norm": 2.4194986820220947, "learning_rate": 8.343544464377849e-06, "loss": 1.2562, "step": 138 }, { "epoch": 0.9068825910931174, "grad_norm": 2.965425491333008, "learning_rate": 6.402075370189913e-06, "loss": 1.2637, "step": 140 }, { "epoch": 0.9068825910931174, "eval_loss": 1.271282434463501, "eval_runtime": 26.5644, "eval_samples_per_second": 18.634, "eval_steps_per_second": 2.334, "step": 140 }, { "epoch": 0.9198380566801619, "grad_norm": 2.7396891117095947, "learning_rate": 4.712525830705338e-06, "loss": 1.2302, "step": 142 }, { "epoch": 0.9327935222672065, "grad_norm": 2.365996837615967, "learning_rate": 3.2778598899291465e-06, "loss": 1.2133, "step": 144 }, { "epoch": 0.9327935222672065, "eval_loss": 1.2704302072525024, "eval_runtime": 26.54, "eval_samples_per_second": 18.651, "eval_steps_per_second": 2.336, "step": 144 }, { "epoch": 0.945748987854251, "grad_norm": 2.8826277256011963, "learning_rate": 2.100594439424269e-06, "loss": 1.2561, "step": 146 }, { "epoch": 0.9587044534412955, "grad_norm": 2.938122034072876, "learning_rate": 1.1827948028283352e-06, "loss": 1.2379, "step": 148 }, { "epoch": 0.9587044534412955, "eval_loss": 1.2698720693588257, "eval_runtime": 26.5066, "eval_samples_per_second": 18.675, "eval_steps_per_second": 2.339, "step": 148 }, { "epoch": 0.97165991902834, "grad_norm": 3.354945659637451, "learning_rate": 5.260711125743444e-07, "loss": 1.216, "step": 150 }, { "epoch": 0.9846153846153847, "grad_norm": 2.604530096054077, "learning_rate": 1.315754851712425e-07, "loss": 1.2131, "step": 152 }, { "epoch": 0.9846153846153847, "eval_loss": 1.2696720361709595, "eval_runtime": 26.4168, "eval_samples_per_second": 18.738, "eval_steps_per_second": 2.347, "step": 152 }, { "epoch": 0.9975708502024292, "grad_norm": 2.729032278060913, "learning_rate": 0.0, "loss": 1.2418, "step": 154 } ], "logging_steps": 2, "max_steps": 154, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.1926442566916506e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }