{
"best_metric": 0.9388719201087952,
"best_model_checkpoint": "/home/datta0/models/lora_final/Qwen2-7B_magiccoder_ortho/checkpoint-12",
"epoch": 0.99836867862969,
"eval_steps": 4,
"global_step": 153,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0065252854812398045,
"grad_norm": 4.3915696144104,
"learning_rate": 7.5e-05,
"loss": 0.8061,
"step": 1
},
{
"epoch": 0.013050570962479609,
"grad_norm": 3.7475483417510986,
"learning_rate": 0.00015,
"loss": 0.931,
"step": 2
},
{
"epoch": 0.026101141924959218,
"grad_norm": 5.367969989776611,
"learning_rate": 0.0003,
"loss": 0.8262,
"step": 4
},
{
"epoch": 0.026101141924959218,
"eval_loss": 1.0883674621582031,
"eval_runtime": 24.5353,
"eval_samples_per_second": 19.931,
"eval_steps_per_second": 2.527,
"step": 4
},
{
"epoch": 0.03915171288743882,
"grad_norm": 11.019035339355469,
"learning_rate": 0.00029986665273697545,
"loss": 1.0003,
"step": 6
},
{
"epoch": 0.052202283849918436,
"grad_norm": 2.1015965938568115,
"learning_rate": 0.0002994668480344693,
"loss": 0.9776,
"step": 8
},
{
"epoch": 0.052202283849918436,
"eval_loss": 0.9662861227989197,
"eval_runtime": 24.6827,
"eval_samples_per_second": 19.811,
"eval_steps_per_second": 2.512,
"step": 8
},
{
"epoch": 0.06525285481239804,
"grad_norm": 1.750746726989746,
"learning_rate": 0.0002988012967306524,
"loss": 0.9319,
"step": 10
},
{
"epoch": 0.07830342577487764,
"grad_norm": 2.2127320766448975,
"learning_rate": 0.000297871182151455,
"loss": 0.9345,
"step": 12
},
{
"epoch": 0.07830342577487764,
"eval_loss": 0.9388719201087952,
"eval_runtime": 24.6723,
"eval_samples_per_second": 19.82,
"eval_steps_per_second": 2.513,
"step": 12
},
{
"epoch": 0.09135399673735727,
"grad_norm": 1.6279808282852173,
"learning_rate": 0.00029667815800665635,
"loss": 0.9489,
"step": 14
},
{
"epoch": 0.10440456769983687,
"grad_norm": 1.5321425199508667,
"learning_rate": 0.0002952243454496488,
"loss": 0.9026,
"step": 16
},
{
"epoch": 0.10440456769983687,
"eval_loss": 0.9481999278068542,
"eval_runtime": 24.6231,
"eval_samples_per_second": 19.859,
"eval_steps_per_second": 2.518,
"step": 16
},
{
"epoch": 0.11745513866231648,
"grad_norm": 1.355367660522461,
"learning_rate": 0.0002935123293061047,
"loss": 0.9004,
"step": 18
},
{
"epoch": 0.13050570962479607,
"grad_norm": 1.5347142219543457,
"learning_rate": 0.0002915451534782506,
"loss": 0.9618,
"step": 20
},
{
"epoch": 0.13050570962479607,
"eval_loss": 0.957125186920166,
"eval_runtime": 24.5628,
"eval_samples_per_second": 19.908,
"eval_steps_per_second": 2.524,
"step": 20
},
{
"epoch": 0.14355628058727568,
"grad_norm": 1.5545670986175537,
"learning_rate": 0.0002893263155329204,
"loss": 0.9457,
"step": 22
},
{
"epoch": 0.1566068515497553,
"grad_norm": 1.4520829916000366,
"learning_rate": 0.00028685976048300875,
"loss": 0.8685,
"step": 24
},
{
"epoch": 0.1566068515497553,
"eval_loss": 0.9719114899635315,
"eval_runtime": 24.4752,
"eval_samples_per_second": 19.979,
"eval_steps_per_second": 2.533,
"step": 24
},
{
"epoch": 0.16965742251223492,
"grad_norm": 1.3440358638763428,
"learning_rate": 0.00028414987377338235,
"loss": 1.0084,
"step": 26
},
{
"epoch": 0.18270799347471453,
"grad_norm": 1.43779718875885,
"learning_rate": 0.0002812014734837191,
"loss": 0.8834,
"step": 28
},
{
"epoch": 0.18270799347471453,
"eval_loss": 0.9751714468002319,
"eval_runtime": 56.0537,
"eval_samples_per_second": 8.724,
"eval_steps_per_second": 1.106,
"step": 28
},
{
"epoch": 0.19575856443719414,
"grad_norm": 1.4336111545562744,
"learning_rate": 0.0002780198017621379,
"loss": 0.9617,
"step": 30
},
{
"epoch": 0.20880913539967375,
"grad_norm": 1.291685700416565,
"learning_rate": 0.00027461051550485116,
"loss": 1.0185,
"step": 32
},
{
"epoch": 0.20880913539967375,
"eval_loss": 0.987638533115387,
"eval_runtime": 59.7835,
"eval_samples_per_second": 8.18,
"eval_steps_per_second": 1.037,
"step": 32
},
{
"epoch": 0.22185970636215335,
"grad_norm": 1.3673038482666016,
"learning_rate": 0.00027097967629840906,
"loss": 0.9289,
"step": 34
},
{
"epoch": 0.23491027732463296,
"grad_norm": 1.321115255355835,
"learning_rate": 0.0002671337396424204,
"loss": 0.9354,
"step": 36
},
{
"epoch": 0.23491027732463296,
"eval_loss": 0.9922739863395691,
"eval_runtime": 58.5264,
"eval_samples_per_second": 8.355,
"eval_steps_per_second": 1.059,
"step": 36
},
{
"epoch": 0.24796084828711257,
"grad_norm": 1.3209092617034912,
"learning_rate": 0.00026307954347190983,
"loss": 0.9003,
"step": 38
},
{
"epoch": 0.26101141924959215,
"grad_norm": 1.2740062475204468,
"learning_rate": 0.00025882429599971866,
"loss": 0.9734,
"step": 40
},
{
"epoch": 0.26101141924959215,
"eval_loss": 0.9982444047927856,
"eval_runtime": 56.4488,
"eval_samples_per_second": 8.663,
"eval_steps_per_second": 1.098,
"step": 40
},
{
"epoch": 0.2740619902120718,
"grad_norm": 1.3060563802719116,
"learning_rate": 0.0002543755629005657,
"loss": 0.9583,
"step": 42
},
{
"epoch": 0.28711256117455136,
"grad_norm": 1.2693545818328857,
"learning_rate": 0.0002497412538595537,
"loss": 1.034,
"step": 44
},
{
"epoch": 0.28711256117455136,
"eval_loss": 1.0034517049789429,
"eval_runtime": 57.2987,
"eval_samples_per_second": 8.534,
"eval_steps_per_second": 1.082,
"step": 44
},
{
"epoch": 0.300163132137031,
"grad_norm": 1.3035016059875488,
"learning_rate": 0.00024492960850903755,
"loss": 0.9648,
"step": 46
},
{
"epoch": 0.3132137030995106,
"grad_norm": 1.4393730163574219,
"learning_rate": 0.00023994918177885902,
"loss": 1.0067,
"step": 48
},
{
"epoch": 0.3132137030995106,
"eval_loss": 1.0048160552978516,
"eval_runtime": 56.3331,
"eval_samples_per_second": 8.681,
"eval_steps_per_second": 1.101,
"step": 48
},
{
"epoch": 0.3262642740619902,
"grad_norm": 1.5344454050064087,
"learning_rate": 0.0002348088286859938,
"loss": 1.0498,
"step": 50
},
{
"epoch": 0.33931484502446985,
"grad_norm": 1.2956377267837524,
"learning_rate": 0.00022951768859065402,
"loss": 0.932,
"step": 52
},
{
"epoch": 0.33931484502446985,
"eval_loss": 1.00808584690094,
"eval_runtime": 56.5841,
"eval_samples_per_second": 8.642,
"eval_steps_per_second": 1.096,
"step": 52
},
{
"epoch": 0.3523654159869494,
"grad_norm": 1.5058661699295044,
"learning_rate": 0.0002240851689468395,
"loss": 0.9455,
"step": 54
},
{
"epoch": 0.36541598694942906,
"grad_norm": 1.3148020505905151,
"learning_rate": 0.00021852092857622808,
"loss": 0.9407,
"step": 56
},
{
"epoch": 0.36541598694942906,
"eval_loss": 1.006118655204773,
"eval_runtime": 57.5921,
"eval_samples_per_second": 8.491,
"eval_steps_per_second": 1.077,
"step": 56
},
{
"epoch": 0.37846655791190864,
"grad_norm": 1.1989065408706665,
"learning_rate": 0.00021283486049514277,
"loss": 1.023,
"step": 58
},
{
"epoch": 0.3915171288743883,
"grad_norm": 1.3775067329406738,
"learning_rate": 0.00020703707432513004,
"loss": 0.9682,
"step": 60
},
{
"epoch": 0.3915171288743883,
"eval_loss": 1.0053811073303223,
"eval_runtime": 57.2201,
"eval_samples_per_second": 8.546,
"eval_steps_per_second": 1.084,
"step": 60
},
{
"epoch": 0.40456769983686786,
"grad_norm": 1.320212960243225,
"learning_rate": 0.00020113787831842152,
"loss": 0.8986,
"step": 62
},
{
"epoch": 0.4176182707993475,
"grad_norm": 1.325500726699829,
"learning_rate": 0.0001951477610302378,
"loss": 1.0224,
"step": 64
},
{
"epoch": 0.4176182707993475,
"eval_loss": 1.0092753171920776,
"eval_runtime": 56.0548,
"eval_samples_per_second": 8.724,
"eval_steps_per_second": 1.106,
"step": 64
},
{
"epoch": 0.43066884176182707,
"grad_norm": 1.2880396842956543,
"learning_rate": 0.0001890773726705198,
"loss": 0.9943,
"step": 66
},
{
"epoch": 0.4437194127243067,
"grad_norm": 1.237645149230957,
"learning_rate": 0.00018293750616824443,
"loss": 1.0145,
"step": 68
},
{
"epoch": 0.4437194127243067,
"eval_loss": 1.009407639503479,
"eval_runtime": 24.7188,
"eval_samples_per_second": 19.783,
"eval_steps_per_second": 2.508,
"step": 68
},
{
"epoch": 0.4567699836867863,
"grad_norm": 1.1990931034088135,
"learning_rate": 0.00017673907798199052,
"loss": 1.0333,
"step": 70
},
{
"epoch": 0.4698205546492659,
"grad_norm": 1.2862218618392944,
"learning_rate": 0.000170493108690874,
"loss": 0.9756,
"step": 72
},
{
"epoch": 0.4698205546492659,
"eval_loss": 1.010068416595459,
"eval_runtime": 24.7249,
"eval_samples_per_second": 19.778,
"eval_steps_per_second": 2.508,
"step": 72
},
{
"epoch": 0.4828711256117455,
"grad_norm": 1.2775288820266724,
"learning_rate": 0.00016421070340036023,
"loss": 1.0124,
"step": 74
},
{
"epoch": 0.49592169657422513,
"grad_norm": 1.3676966428756714,
"learning_rate": 0.00015790303199779193,
"loss": 0.9968,
"step": 76
},
{
"epoch": 0.49592169657422513,
"eval_loss": 1.0086660385131836,
"eval_runtime": 24.6773,
"eval_samples_per_second": 19.816,
"eval_steps_per_second": 2.512,
"step": 76
},
{
"epoch": 0.5089722675367048,
"grad_norm": 1.2739876508712769,
"learning_rate": 0.00015158130929273695,
"loss": 0.9405,
"step": 78
},
{
"epoch": 0.5220228384991843,
"grad_norm": 1.3879481554031372,
"learning_rate": 0.00014525677507746615,
"loss": 0.9566,
"step": 80
},
{
"epoch": 0.5220228384991843,
"eval_loss": 1.0094032287597656,
"eval_runtime": 24.6744,
"eval_samples_per_second": 19.818,
"eval_steps_per_second": 2.513,
"step": 80
},
{
"epoch": 0.5350734094616639,
"grad_norm": 1.246418833732605,
"learning_rate": 0.00013894067414301314,
"loss": 1.0481,
"step": 82
},
{
"epoch": 0.5481239804241436,
"grad_norm": 1.3928742408752441,
"learning_rate": 0.0001326442362863458,
"loss": 1.0394,
"step": 84
},
{
"epoch": 0.5481239804241436,
"eval_loss": 1.008681297302246,
"eval_runtime": 24.6024,
"eval_samples_per_second": 19.876,
"eval_steps_per_second": 2.52,
"step": 84
},
{
"epoch": 0.5611745513866232,
"grad_norm": 1.2170292139053345,
"learning_rate": 0.00012637865634419735,
"loss": 0.9979,
"step": 86
},
{
"epoch": 0.5742251223491027,
"grad_norm": 1.3591171503067017,
"learning_rate": 0.00012015507428905507,
"loss": 0.9546,
"step": 88
},
{
"epoch": 0.5742251223491027,
"eval_loss": 1.0074015855789185,
"eval_runtime": 24.5002,
"eval_samples_per_second": 19.959,
"eval_steps_per_second": 2.531,
"step": 88
},
{
"epoch": 0.5872756933115824,
"grad_norm": 1.336329460144043,
"learning_rate": 0.00011398455542269575,
"loss": 0.9125,
"step": 90
},
{
"epoch": 0.600326264274062,
"grad_norm": 1.2378321886062622,
"learning_rate": 0.00010787807070248305,
"loss": 1.0347,
"step": 92
},
{
"epoch": 0.600326264274062,
"eval_loss": 1.0086424350738525,
"eval_runtime": 24.4221,
"eval_samples_per_second": 20.023,
"eval_steps_per_second": 2.539,
"step": 92
},
{
"epoch": 0.6133768352365416,
"grad_norm": 1.3458659648895264,
"learning_rate": 0.00010184647723540557,
"loss": 0.9567,
"step": 94
},
{
"epoch": 0.6264274061990212,
"grad_norm": 1.251621961593628,
"learning_rate": 9.590049897453668e-05,
"loss": 0.9639,
"step": 96
},
{
"epoch": 0.6264274061990212,
"eval_loss": 1.004166841506958,
"eval_runtime": 56.7542,
"eval_samples_per_second": 8.616,
"eval_steps_per_second": 1.092,
"step": 96
},
{
"epoch": 0.6394779771615008,
"grad_norm": 1.15924870967865,
"learning_rate": 9.005070765223768e-05,
"loss": 1.0447,
"step": 98
},
{
"epoch": 0.6525285481239804,
"grad_norm": 1.4097235202789307,
"learning_rate": 8.430750398400308e-05,
"loss": 1.0543,
"step": 100
},
{
"epoch": 0.6525285481239804,
"eval_loss": 1.002665638923645,
"eval_runtime": 55.6845,
"eval_samples_per_second": 8.782,
"eval_steps_per_second": 1.113,
"step": 100
},
{
"epoch": 0.6655791190864601,
"grad_norm": 1.3108314275741577,
"learning_rate": 7.868109917636821e-05,
"loss": 0.9645,
"step": 102
},
{
"epoch": 0.6786296900489397,
"grad_norm": 1.2921593189239502,
"learning_rate": 7.318149677175675e-05,
"loss": 0.9346,
"step": 104
},
{
"epoch": 0.6786296900489397,
"eval_loss": 1.003048300743103,
"eval_runtime": 57.4498,
"eval_samples_per_second": 8.512,
"eval_steps_per_second": 1.079,
"step": 104
},
{
"epoch": 0.6916802610114192,
"grad_norm": 1.2615606784820557,
"learning_rate": 6.781847486254697e-05,
"loss": 0.9565,
"step": 106
},
{
"epoch": 0.7047308319738989,
"grad_norm": 1.3441969156265259,
"learning_rate": 6.260156870598071e-05,
"loss": 0.9744,
"step": 108
},
{
"epoch": 0.7047308319738989,
"eval_loss": 1.0019466876983643,
"eval_runtime": 56.5017,
"eval_samples_per_second": 8.655,
"eval_steps_per_second": 1.097,
"step": 108
},
{
"epoch": 0.7177814029363785,
"grad_norm": 1.1984766721725464,
"learning_rate": 5.7540053770823644e-05,
"loss": 0.9558,
"step": 110
},
{
"epoch": 0.7308319738988581,
"grad_norm": 1.259084701538086,
"learning_rate": 5.264292924592073e-05,
"loss": 0.9546,
"step": 112
},
{
"epoch": 0.7308319738988581,
"eval_loss": 0.9984883069992065,
"eval_runtime": 56.938,
"eval_samples_per_second": 8.588,
"eval_steps_per_second": 1.089,
"step": 112
},
{
"epoch": 0.7438825448613376,
"grad_norm": 1.2619880437850952,
"learning_rate": 4.791890203996634e-05,
"loss": 0.9784,
"step": 114
},
{
"epoch": 0.7569331158238173,
"grad_norm": 1.1600760221481323,
"learning_rate": 4.3376371300938786e-05,
"loss": 0.9138,
"step": 116
},
{
"epoch": 0.7569331158238173,
"eval_loss": 0.9968593716621399,
"eval_runtime": 56.3466,
"eval_samples_per_second": 8.678,
"eval_steps_per_second": 1.1,
"step": 116
},
{
"epoch": 0.7699836867862969,
"grad_norm": 1.2669286727905273,
"learning_rate": 3.9023413482721426e-05,
"loss": 0.9714,
"step": 118
},
{
"epoch": 0.7830342577487766,
"grad_norm": 1.2815442085266113,
"learning_rate": 3.4867767985462507e-05,
"loss": 0.9026,
"step": 120
},
{
"epoch": 0.7830342577487766,
"eval_loss": 0.9961332082748413,
"eval_runtime": 56.6715,
"eval_samples_per_second": 8.629,
"eval_steps_per_second": 1.094,
"step": 120
},
{
"epoch": 0.7960848287112561,
"grad_norm": 1.2086176872253418,
"learning_rate": 3.09168233952042e-05,
"loss": 1.0291,
"step": 122
},
{
"epoch": 0.8091353996737357,
"grad_norm": 1.2728592157363892,
"learning_rate": 2.717760434724613e-05,
"loss": 0.9746,
"step": 124
},
{
"epoch": 0.8091353996737357,
"eval_loss": 0.9953013062477112,
"eval_runtime": 56.695,
"eval_samples_per_second": 8.625,
"eval_steps_per_second": 1.094,
"step": 124
},
{
"epoch": 0.8221859706362153,
"grad_norm": 1.163971185684204,
"learning_rate": 2.3656759036600187e-05,
"loss": 0.9733,
"step": 126
},
{
"epoch": 0.835236541598695,
"grad_norm": 1.2905343770980835,
"learning_rate": 2.0360547397742523e-05,
"loss": 0.9453,
"step": 128
},
{
"epoch": 0.835236541598695,
"eval_loss": 0.9950230717658997,
"eval_runtime": 57.4352,
"eval_samples_per_second": 8.514,
"eval_steps_per_second": 1.079,
"step": 128
},
{
"epoch": 0.8482871125611745,
"grad_norm": 1.2126384973526,
"learning_rate": 1.7294829974678338e-05,
"loss": 0.922,
"step": 130
},
{
"epoch": 0.8613376835236541,
"grad_norm": 1.3399946689605713,
"learning_rate": 1.4465057501108546e-05,
"loss": 1.0311,
"step": 132
},
{
"epoch": 0.8613376835236541,
"eval_loss": 0.9933781027793884,
"eval_runtime": 56.8077,
"eval_samples_per_second": 8.608,
"eval_steps_per_second": 1.091,
"step": 132
},
{
"epoch": 0.8743882544861338,
"grad_norm": 1.2741433382034302,
"learning_rate": 1.1876261209224314e-05,
"loss": 0.9365,
"step": 134
},
{
"epoch": 0.8874388254486134,
"grad_norm": 1.1750285625457764,
"learning_rate": 9.533043884359615e-06,
"loss": 0.971,
"step": 136
},
{
"epoch": 0.8874388254486134,
"eval_loss": 0.992695152759552,
"eval_runtime": 24.7252,
"eval_samples_per_second": 19.777,
"eval_steps_per_second": 2.508,
"step": 136
},
{
"epoch": 0.9004893964110929,
"grad_norm": 1.1639913320541382,
"learning_rate": 7.439571681407053e-06,
"loss": 1.0128,
"step": 138
},
{
"epoch": 0.9135399673735726,
"grad_norm": 1.2708672285079956,
"learning_rate": 5.59956671754635e-06,
"loss": 0.9957,
"step": 140
},
{
"epoch": 0.9135399673735726,
"eval_loss": 0.9919000864028931,
"eval_runtime": 24.7098,
"eval_samples_per_second": 19.79,
"eval_steps_per_second": 2.509,
"step": 140
},
{
"epoch": 0.9265905383360522,
"grad_norm": 1.3160277605056763,
"learning_rate": 4.016300454455945e-06,
"loss": 1.0054,
"step": 142
},
{
"epoch": 0.9396411092985318,
"grad_norm": 1.325445532798767,
"learning_rate": 2.692587881773478e-06,
"loss": 0.9502,
"step": 144
},
{
"epoch": 0.9396411092985318,
"eval_loss": 0.9917099475860596,
"eval_runtime": 24.7029,
"eval_samples_per_second": 19.795,
"eval_steps_per_second": 2.51,
"step": 144
},
{
"epoch": 0.9526916802610114,
"grad_norm": 1.1836706399917603,
"learning_rate": 1.6307825121469164e-06,
"loss": 0.991,
"step": 146
},
{
"epoch": 0.965742251223491,
"grad_norm": 1.2473053932189941,
"learning_rate": 8.327721967749779e-07,
"loss": 1.0133,
"step": 148
},
{
"epoch": 0.965742251223491,
"eval_loss": 0.9915127158164978,
"eval_runtime": 24.6188,
"eval_samples_per_second": 19.863,
"eval_steps_per_second": 2.518,
"step": 148
},
{
"epoch": 0.9787928221859706,
"grad_norm": 1.237483024597168,
"learning_rate": 2.9997576887660913e-07,
"loss": 0.9316,
"step": 150
},
{
"epoch": 0.9918433931484503,
"grad_norm": 1.279980182647705,
"learning_rate": 3.334052105728458e-08,
"loss": 0.9684,
"step": 152
},
{
"epoch": 0.9918433931484503,
"eval_loss": 0.9916173219680786,
"eval_runtime": 24.5672,
"eval_samples_per_second": 19.905,
"eval_steps_per_second": 2.524,
"step": 152
}
],
"logging_steps": 2,
"max_steps": 153,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 4,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.85963932651946e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}