{
"best_metric": 0.9220191240310669,
"best_model_checkpoint": "/home/datta0/models/lora_final/Qwen2-7B_magiccoder_default/checkpoint-4",
"epoch": 0.99836867862969,
"eval_steps": 4,
"global_step": 153,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0065252854812398045,
"grad_norm": 6.940319538116455,
"learning_rate": 7.5e-05,
"loss": 0.8051,
"step": 1
},
{
"epoch": 0.013050570962479609,
"grad_norm": 5.597632884979248,
"learning_rate": 0.00015,
"loss": 0.9302,
"step": 2
},
{
"epoch": 0.026101141924959218,
"grad_norm": 4.383840084075928,
"learning_rate": 0.0003,
"loss": 0.8215,
"step": 4
},
{
"epoch": 0.026101141924959218,
"eval_loss": 0.9220191240310669,
"eval_runtime": 24.7526,
"eval_samples_per_second": 19.756,
"eval_steps_per_second": 2.505,
"step": 4
},
{
"epoch": 0.03915171288743882,
"grad_norm": 3.2939515113830566,
"learning_rate": 0.00029986665273697545,
"loss": 0.8629,
"step": 6
},
{
"epoch": 0.052202283849918436,
"grad_norm": 49.209835052490234,
"learning_rate": 0.0002994668480344693,
"loss": 0.9247,
"step": 8
},
{
"epoch": 0.052202283849918436,
"eval_loss": 0.9779874086380005,
"eval_runtime": 24.7429,
"eval_samples_per_second": 19.763,
"eval_steps_per_second": 2.506,
"step": 8
},
{
"epoch": 0.06525285481239804,
"grad_norm": 2.9049675464630127,
"learning_rate": 0.0002988012967306524,
"loss": 0.9425,
"step": 10
},
{
"epoch": 0.07830342577487764,
"grad_norm": 2.8929295539855957,
"learning_rate": 0.000297871182151455,
"loss": 0.9611,
"step": 12
},
{
"epoch": 0.07830342577487764,
"eval_loss": 0.9693307876586914,
"eval_runtime": 24.7094,
"eval_samples_per_second": 19.79,
"eval_steps_per_second": 2.509,
"step": 12
},
{
"epoch": 0.09135399673735727,
"grad_norm": 2.5417490005493164,
"learning_rate": 0.00029667815800665635,
"loss": 0.9792,
"step": 14
},
{
"epoch": 0.10440456769983687,
"grad_norm": 2.707855224609375,
"learning_rate": 0.0002952243454496488,
"loss": 0.9392,
"step": 16
},
{
"epoch": 0.10440456769983687,
"eval_loss": 0.9866985082626343,
"eval_runtime": 24.6246,
"eval_samples_per_second": 19.858,
"eval_steps_per_second": 2.518,
"step": 16
},
{
"epoch": 0.11745513866231648,
"grad_norm": 7.373922348022461,
"learning_rate": 0.0002935123293061047,
"loss": 0.9393,
"step": 18
},
{
"epoch": 0.13050570962479607,
"grad_norm": 2.5633223056793213,
"learning_rate": 0.0002915451534782506,
"loss": 1.0135,
"step": 20
},
{
"epoch": 0.13050570962479607,
"eval_loss": 1.0108040571212769,
"eval_runtime": 24.6129,
"eval_samples_per_second": 19.868,
"eval_steps_per_second": 2.519,
"step": 20
},
{
"epoch": 0.14355628058727568,
"grad_norm": 2.2816696166992188,
"learning_rate": 0.0002893263155329204,
"loss": 1.0003,
"step": 22
},
{
"epoch": 0.1566068515497553,
"grad_norm": 2.6352310180664062,
"learning_rate": 0.00028685976048300875,
"loss": 0.9152,
"step": 24
},
{
"epoch": 0.1566068515497553,
"eval_loss": 1.0166871547698975,
"eval_runtime": 24.4896,
"eval_samples_per_second": 19.968,
"eval_steps_per_second": 2.532,
"step": 24
},
{
"epoch": 0.16965742251223492,
"grad_norm": 2.428823232650757,
"learning_rate": 0.00028414987377338235,
"loss": 1.0468,
"step": 26
},
{
"epoch": 0.18270799347471453,
"grad_norm": 2.590581178665161,
"learning_rate": 0.0002812014734837191,
"loss": 0.9298,
"step": 28
},
{
"epoch": 0.18270799347471453,
"eval_loss": 1.0250943899154663,
"eval_runtime": 54.7083,
"eval_samples_per_second": 8.938,
"eval_steps_per_second": 1.133,
"step": 28
},
{
"epoch": 0.19575856443719414,
"grad_norm": 2.5896878242492676,
"learning_rate": 0.0002780198017621379,
"loss": 1.0095,
"step": 30
},
{
"epoch": 0.20880913539967375,
"grad_norm": 2.394001007080078,
"learning_rate": 0.00027461051550485116,
"loss": 1.0625,
"step": 32
},
{
"epoch": 0.20880913539967375,
"eval_loss": 1.0349429845809937,
"eval_runtime": 54.8183,
"eval_samples_per_second": 8.92,
"eval_steps_per_second": 1.131,
"step": 32
},
{
"epoch": 0.22185970636215335,
"grad_norm": 2.3402562141418457,
"learning_rate": 0.00027097967629840906,
"loss": 0.9817,
"step": 34
},
{
"epoch": 0.23491027732463296,
"grad_norm": 2.0935347080230713,
"learning_rate": 0.0002671337396424204,
"loss": 0.9695,
"step": 36
},
{
"epoch": 0.23491027732463296,
"eval_loss": 1.0332014560699463,
"eval_runtime": 55.2086,
"eval_samples_per_second": 8.857,
"eval_steps_per_second": 1.123,
"step": 36
},
{
"epoch": 0.24796084828711257,
"grad_norm": 1.9977389574050903,
"learning_rate": 0.00026307954347190983,
"loss": 0.9429,
"step": 38
},
{
"epoch": 0.26101141924959215,
"grad_norm": 2.104321241378784,
"learning_rate": 0.00025882429599971866,
"loss": 1.0104,
"step": 40
},
{
"epoch": 0.26101141924959215,
"eval_loss": 1.0390156507492065,
"eval_runtime": 55.2819,
"eval_samples_per_second": 8.846,
"eval_steps_per_second": 1.122,
"step": 40
},
{
"epoch": 0.2740619902120718,
"grad_norm": 1.943311095237732,
"learning_rate": 0.0002543755629005657,
"loss": 0.9952,
"step": 42
},
{
"epoch": 0.28711256117455136,
"grad_norm": 2.2244155406951904,
"learning_rate": 0.0002497412538595537,
"loss": 1.0721,
"step": 44
},
{
"epoch": 0.28711256117455136,
"eval_loss": 1.0405514240264893,
"eval_runtime": 55.1226,
"eval_samples_per_second": 8.871,
"eval_steps_per_second": 1.125,
"step": 44
},
{
"epoch": 0.300163132137031,
"grad_norm": 2.213677406311035,
"learning_rate": 0.00024492960850903755,
"loss": 0.9997,
"step": 46
},
{
"epoch": 0.3132137030995106,
"grad_norm": 2.108431100845337,
"learning_rate": 0.00023994918177885902,
"loss": 1.0397,
"step": 48
},
{
"epoch": 0.3132137030995106,
"eval_loss": 1.0448977947235107,
"eval_runtime": 55.076,
"eval_samples_per_second": 8.879,
"eval_steps_per_second": 1.126,
"step": 48
},
{
"epoch": 0.3262642740619902,
"grad_norm": 2.3814570903778076,
"learning_rate": 0.0002348088286859938,
"loss": 1.0839,
"step": 50
},
{
"epoch": 0.33931484502446985,
"grad_norm": 2.261181116104126,
"learning_rate": 0.00022951768859065402,
"loss": 0.9623,
"step": 52
},
{
"epoch": 0.33931484502446985,
"eval_loss": 1.0447765588760376,
"eval_runtime": 55.341,
"eval_samples_per_second": 8.836,
"eval_steps_per_second": 1.12,
"step": 52
},
{
"epoch": 0.3523654159869494,
"grad_norm": 2.159951686859131,
"learning_rate": 0.0002240851689468395,
"loss": 0.9753,
"step": 54
},
{
"epoch": 0.36541598694942906,
"grad_norm": 2.21645188331604,
"learning_rate": 0.00021852092857622808,
"loss": 0.9735,
"step": 56
},
{
"epoch": 0.36541598694942906,
"eval_loss": 1.0435727834701538,
"eval_runtime": 54.9591,
"eval_samples_per_second": 8.898,
"eval_steps_per_second": 1.128,
"step": 56
},
{
"epoch": 0.37846655791190864,
"grad_norm": 2.0440618991851807,
"learning_rate": 0.00021283486049514277,
"loss": 1.051,
"step": 58
},
{
"epoch": 0.3915171288743883,
"grad_norm": 2.2410900592803955,
"learning_rate": 0.00020703707432513004,
"loss": 1.0016,
"step": 60
},
{
"epoch": 0.3915171288743883,
"eval_loss": 1.043070673942566,
"eval_runtime": 55.0676,
"eval_samples_per_second": 8.88,
"eval_steps_per_second": 1.126,
"step": 60
},
{
"epoch": 0.40456769983686786,
"grad_norm": 2.09979248046875,
"learning_rate": 0.00020113787831842152,
"loss": 0.9375,
"step": 62
},
{
"epoch": 0.4176182707993475,
"grad_norm": 2.2402355670928955,
"learning_rate": 0.0001951477610302378,
"loss": 1.0557,
"step": 64
},
{
"epoch": 0.4176182707993475,
"eval_loss": 1.0401309728622437,
"eval_runtime": 55.6078,
"eval_samples_per_second": 8.794,
"eval_steps_per_second": 1.115,
"step": 64
},
{
"epoch": 0.43066884176182707,
"grad_norm": 2.0318586826324463,
"learning_rate": 0.0001890773726705198,
"loss": 1.0214,
"step": 66
},
{
"epoch": 0.4437194127243067,
"grad_norm": 2.138606309890747,
"learning_rate": 0.00018293750616824443,
"loss": 1.0377,
"step": 68
},
{
"epoch": 0.4437194127243067,
"eval_loss": 1.0372790098190308,
"eval_runtime": 24.7257,
"eval_samples_per_second": 19.777,
"eval_steps_per_second": 2.508,
"step": 68
},
{
"epoch": 0.4567699836867863,
"grad_norm": 1.7595700025558472,
"learning_rate": 0.00017673907798199052,
"loss": 1.0546,
"step": 70
},
{
"epoch": 0.4698205546492659,
"grad_norm": 1.987815022468567,
"learning_rate": 0.000170493108690874,
"loss": 1.0022,
"step": 72
},
{
"epoch": 0.4698205546492659,
"eval_loss": 1.0360997915267944,
"eval_runtime": 24.7474,
"eval_samples_per_second": 19.76,
"eval_steps_per_second": 2.505,
"step": 72
},
{
"epoch": 0.4828711256117455,
"grad_norm": 1.9406994581222534,
"learning_rate": 0.00016421070340036023,
"loss": 1.0372,
"step": 74
},
{
"epoch": 0.49592169657422513,
"grad_norm": 2.0799319744110107,
"learning_rate": 0.00015790303199779193,
"loss": 1.0193,
"step": 76
},
{
"epoch": 0.49592169657422513,
"eval_loss": 1.0328214168548584,
"eval_runtime": 24.7131,
"eval_samples_per_second": 19.787,
"eval_steps_per_second": 2.509,
"step": 76
},
{
"epoch": 0.5089722675367048,
"grad_norm": 2.057676315307617,
"learning_rate": 0.00015158130929273695,
"loss": 0.9597,
"step": 78
},
{
"epoch": 0.5220228384991843,
"grad_norm": 2.00854754447937,
"learning_rate": 0.00014525677507746615,
"loss": 0.9806,
"step": 80
},
{
"epoch": 0.5220228384991843,
"eval_loss": 1.0301356315612793,
"eval_runtime": 24.7052,
"eval_samples_per_second": 19.793,
"eval_steps_per_second": 2.51,
"step": 80
},
{
"epoch": 0.5350734094616639,
"grad_norm": 1.9844895601272583,
"learning_rate": 0.00013894067414301314,
"loss": 1.068,
"step": 82
},
{
"epoch": 0.5481239804241436,
"grad_norm": 1.9492027759552002,
"learning_rate": 0.0001326442362863458,
"loss": 1.0542,
"step": 84
},
{
"epoch": 0.5481239804241436,
"eval_loss": 1.0262655019760132,
"eval_runtime": 24.6275,
"eval_samples_per_second": 19.856,
"eval_steps_per_second": 2.518,
"step": 84
},
{
"epoch": 0.5611745513866232,
"grad_norm": 1.8868807554244995,
"learning_rate": 0.00012637865634419735,
"loss": 1.0136,
"step": 86
},
{
"epoch": 0.5742251223491027,
"grad_norm": 1.9024137258529663,
"learning_rate": 0.00012015507428905507,
"loss": 0.9692,
"step": 88
},
{
"epoch": 0.5742251223491027,
"eval_loss": 1.024366021156311,
"eval_runtime": 24.5294,
"eval_samples_per_second": 19.935,
"eval_steps_per_second": 2.528,
"step": 88
},
{
"epoch": 0.5872756933115824,
"grad_norm": 2.2372443675994873,
"learning_rate": 0.00011398455542269575,
"loss": 0.9305,
"step": 90
},
{
"epoch": 0.600326264274062,
"grad_norm": 1.8708783388137817,
"learning_rate": 0.00010787807070248305,
"loss": 1.0464,
"step": 92
},
{
"epoch": 0.600326264274062,
"eval_loss": 1.0215392112731934,
"eval_runtime": 24.4525,
"eval_samples_per_second": 19.998,
"eval_steps_per_second": 2.536,
"step": 92
},
{
"epoch": 0.6133768352365416,
"grad_norm": 2.0300116539001465,
"learning_rate": 0.00010184647723540557,
"loss": 0.9709,
"step": 94
},
{
"epoch": 0.6264274061990212,
"grad_norm": 2.0198493003845215,
"learning_rate": 9.590049897453668e-05,
"loss": 0.9771,
"step": 96
},
{
"epoch": 0.6264274061990212,
"eval_loss": 1.01658034324646,
"eval_runtime": 53.8396,
"eval_samples_per_second": 9.083,
"eval_steps_per_second": 1.152,
"step": 96
},
{
"epoch": 0.6394779771615008,
"grad_norm": 1.8200911283493042,
"learning_rate": 9.005070765223768e-05,
"loss": 1.0565,
"step": 98
},
{
"epoch": 0.6525285481239804,
"grad_norm": 2.173635721206665,
"learning_rate": 8.430750398400308e-05,
"loss": 1.0659,
"step": 100
},
{
"epoch": 0.6525285481239804,
"eval_loss": 1.0145906209945679,
"eval_runtime": 55.4651,
"eval_samples_per_second": 8.816,
"eval_steps_per_second": 1.118,
"step": 100
},
{
"epoch": 0.6655791190864601,
"grad_norm": 1.9142309427261353,
"learning_rate": 7.868109917636821e-05,
"loss": 0.9761,
"step": 102
},
{
"epoch": 0.6786296900489397,
"grad_norm": 1.9679898023605347,
"learning_rate": 7.318149677175675e-05,
"loss": 0.9476,
"step": 104
},
{
"epoch": 0.6786296900489397,
"eval_loss": 1.0106278657913208,
"eval_runtime": 55.6719,
"eval_samples_per_second": 8.784,
"eval_steps_per_second": 1.114,
"step": 104
},
{
"epoch": 0.6916802610114192,
"grad_norm": 1.9258702993392944,
"learning_rate": 6.781847486254697e-05,
"loss": 0.963,
"step": 106
},
{
"epoch": 0.7047308319738989,
"grad_norm": 2.029904842376709,
"learning_rate": 6.260156870598071e-05,
"loss": 0.983,
"step": 108
},
{
"epoch": 0.7047308319738989,
"eval_loss": 1.0074269771575928,
"eval_runtime": 57.0045,
"eval_samples_per_second": 8.578,
"eval_steps_per_second": 1.088,
"step": 108
},
{
"epoch": 0.7177814029363785,
"grad_norm": 1.779940128326416,
"learning_rate": 5.7540053770823644e-05,
"loss": 0.9698,
"step": 110
},
{
"epoch": 0.7308319738988581,
"grad_norm": 2.0144851207733154,
"learning_rate": 5.264292924592073e-05,
"loss": 0.9585,
"step": 112
},
{
"epoch": 0.7308319738988581,
"eval_loss": 1.0034711360931396,
"eval_runtime": 57.5133,
"eval_samples_per_second": 8.502,
"eval_steps_per_second": 1.078,
"step": 112
},
{
"epoch": 0.7438825448613376,
"grad_norm": 1.9726147651672363,
"learning_rate": 4.791890203996634e-05,
"loss": 0.9865,
"step": 114
},
{
"epoch": 0.7569331158238173,
"grad_norm": 1.7042125463485718,
"learning_rate": 4.3376371300938786e-05,
"loss": 0.9193,
"step": 116
},
{
"epoch": 0.7569331158238173,
"eval_loss": 0.9996815323829651,
"eval_runtime": 57.7466,
"eval_samples_per_second": 8.468,
"eval_steps_per_second": 1.074,
"step": 116
},
{
"epoch": 0.7699836867862969,
"grad_norm": 1.8329825401306152,
"learning_rate": 3.9023413482721426e-05,
"loss": 0.9742,
"step": 118
},
{
"epoch": 0.7830342577487766,
"grad_norm": 1.861943006515503,
"learning_rate": 3.4867767985462507e-05,
"loss": 0.9041,
"step": 120
},
{
"epoch": 0.7830342577487766,
"eval_loss": 0.9974753260612488,
"eval_runtime": 55.1354,
"eval_samples_per_second": 8.869,
"eval_steps_per_second": 1.125,
"step": 120
},
{
"epoch": 0.7960848287112561,
"grad_norm": 1.8173584938049316,
"learning_rate": 3.09168233952042e-05,
"loss": 1.026,
"step": 122
},
{
"epoch": 0.8091353996737357,
"grad_norm": 1.79753839969635,
"learning_rate": 2.717760434724613e-05,
"loss": 0.9697,
"step": 124
},
{
"epoch": 0.8091353996737357,
"eval_loss": 0.9954367876052856,
"eval_runtime": 55.5077,
"eval_samples_per_second": 8.81,
"eval_steps_per_second": 1.117,
"step": 124
},
{
"epoch": 0.8221859706362153,
"grad_norm": 1.7292028665542603,
"learning_rate": 2.3656759036600187e-05,
"loss": 0.9747,
"step": 126
},
{
"epoch": 0.835236541598695,
"grad_norm": 1.9664617776870728,
"learning_rate": 2.0360547397742523e-05,
"loss": 0.9464,
"step": 128
},
{
"epoch": 0.835236541598695,
"eval_loss": 0.9932743906974792,
"eval_runtime": 57.4493,
"eval_samples_per_second": 8.512,
"eval_steps_per_second": 1.079,
"step": 128
},
{
"epoch": 0.8482871125611745,
"grad_norm": 1.82283616065979,
"learning_rate": 1.7294829974678338e-05,
"loss": 0.9256,
"step": 130
},
{
"epoch": 0.8613376835236541,
"grad_norm": 1.9917670488357544,
"learning_rate": 1.4465057501108546e-05,
"loss": 1.0252,
"step": 132
},
{
"epoch": 0.8613376835236541,
"eval_loss": 0.9916940927505493,
"eval_runtime": 55.6062,
"eval_samples_per_second": 8.794,
"eval_steps_per_second": 1.115,
"step": 132
},
{
"epoch": 0.8743882544861338,
"grad_norm": 1.8504716157913208,
"learning_rate": 1.1876261209224314e-05,
"loss": 0.9374,
"step": 134
},
{
"epoch": 0.8874388254486134,
"grad_norm": 1.6590113639831543,
"learning_rate": 9.533043884359615e-06,
"loss": 0.9665,
"step": 136
},
{
"epoch": 0.8874388254486134,
"eval_loss": 0.9909241199493408,
"eval_runtime": 24.7544,
"eval_samples_per_second": 19.754,
"eval_steps_per_second": 2.505,
"step": 136
},
{
"epoch": 0.9004893964110929,
"grad_norm": 1.7258245944976807,
"learning_rate": 7.439571681407053e-06,
"loss": 1.0069,
"step": 138
},
{
"epoch": 0.9135399673735726,
"grad_norm": 1.87185537815094,
"learning_rate": 5.59956671754635e-06,
"loss": 0.9948,
"step": 140
},
{
"epoch": 0.9135399673735726,
"eval_loss": 0.9903515577316284,
"eval_runtime": 24.7378,
"eval_samples_per_second": 19.767,
"eval_steps_per_second": 2.506,
"step": 140
},
{
"epoch": 0.9265905383360522,
"grad_norm": 1.9415644407272339,
"learning_rate": 4.016300454455945e-06,
"loss": 1.0008,
"step": 142
},
{
"epoch": 0.9396411092985318,
"grad_norm": 1.9181973934173584,
"learning_rate": 2.692587881773478e-06,
"loss": 0.946,
"step": 144
},
{
"epoch": 0.9396411092985318,
"eval_loss": 0.9896851778030396,
"eval_runtime": 24.7297,
"eval_samples_per_second": 19.774,
"eval_steps_per_second": 2.507,
"step": 144
},
{
"epoch": 0.9526916802610114,
"grad_norm": 1.8300237655639648,
"learning_rate": 1.6307825121469164e-06,
"loss": 0.9866,
"step": 146
},
{
"epoch": 0.965742251223491,
"grad_norm": 1.893951177597046,
"learning_rate": 8.327721967749779e-07,
"loss": 1.0095,
"step": 148
},
{
"epoch": 0.965742251223491,
"eval_loss": 0.9895658493041992,
"eval_runtime": 24.6594,
"eval_samples_per_second": 19.83,
"eval_steps_per_second": 2.514,
"step": 148
},
{
"epoch": 0.9787928221859706,
"grad_norm": 1.895480990409851,
"learning_rate": 2.9997576887660913e-07,
"loss": 0.9295,
"step": 150
},
{
"epoch": 0.9918433931484503,
"grad_norm": 1.8694380521774292,
"learning_rate": 3.334052105728458e-08,
"loss": 0.9675,
"step": 152
},
{
"epoch": 0.9918433931484503,
"eval_loss": 0.9894064664840698,
"eval_runtime": 24.5775,
"eval_samples_per_second": 19.896,
"eval_steps_per_second": 2.523,
"step": 152
}
],
"logging_steps": 2,
"max_steps": 153,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 4,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.85963932651946e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}