shizhediao2
/

mamba1b-4nodes-lr5e-5-ep1-bsz1024-packing-nemo-sft1-new

Safetensors

jamba

Model card Files Files and versions Community

shizhediao2 commited on Sep 30, 2024

Commit

27e5821

verified ·

1 Parent(s): e64f666

Upload trainer_state.json with huggingface_hub

Browse files

Files changed (1) hide show

trainer_state.json +630 -0

trainer_state.json ADDED Viewed

	@@ -0,0 +1,630 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 200,
+  "global_step": 1505,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.013289036544850499,
+      "grad_norm": 0.10525072365999222,
+      "learning_rate": 4.9978216198586135e-05,
+      "loss": 0.6155,
+      "step": 20
+    },
+    {
+      "epoch": 0.026578073089700997,
+      "grad_norm": 0.08554615080356598,
+      "learning_rate": 4.991290275706486e-05,
+      "loss": 0.5694,
+      "step": 40
+    },
+    {
+      "epoch": 0.03986710963455149,
+      "grad_norm": 0.08361516892910004,
+      "learning_rate": 4.980417349743936e-05,
+      "loss": 0.557,
+      "step": 60
+    },
+    {
+      "epoch": 0.053156146179401995,
+      "grad_norm": 0.08680060505867004,
+      "learning_rate": 4.9652217902637596e-05,
+      "loss": 0.548,
+      "step": 80
+    },
+    {
+      "epoch": 0.0664451827242525,
+      "grad_norm": 0.08960291743278503,
+      "learning_rate": 4.945730078629964e-05,
+      "loss": 0.5427,
+      "step": 100
+    },
+    {
+      "epoch": 0.07973421926910298,
+      "grad_norm": 0.09262242168188095,
+      "learning_rate": 4.921976183128585e-05,
+      "loss": 0.5384,
+      "step": 120
+    },
+    {
+      "epoch": 0.09302325581395349,
+      "grad_norm": 0.08780515193939209,
+      "learning_rate": 4.894001499771015e-05,
+      "loss": 0.5362,
+      "step": 140
+    },
+    {
+      "epoch": 0.10631229235880399,
+      "grad_norm": 0.09249912202358246,
+      "learning_rate": 4.861854780153004e-05,
+      "loss": 0.5324,
+      "step": 160
+    },
+    {
+      "epoch": 0.11960132890365449,
+      "grad_norm": 0.09562400728464127,
+      "learning_rate": 4.825592046495054e-05,
+      "loss": 0.5311,
+      "step": 180
+    },
+    {
+      "epoch": 0.132890365448505,
+      "grad_norm": 0.09372778236865997,
+      "learning_rate": 4.785276494012263e-05,
+      "loss": 0.5278,
+      "step": 200
+    },
+    {
+      "epoch": 0.132890365448505,
+      "eval_accuracy": 0.19452303794312395,
+      "eval_loss": 0.5592088103294373,
+      "eval_runtime": 19.5284,
+      "eval_samples_per_second": 93.914,
+      "eval_steps_per_second": 0.41,
+      "step": 200
+    },
+    {
+      "epoch": 0.1461794019933555,
+      "grad_norm": 0.08762918412685394,
+      "learning_rate": 4.740978380783765e-05,
+      "loss": 0.5253,
+      "step": 220
+    },
+    {
+      "epoch": 0.15946843853820597,
+      "grad_norm": 0.08518578112125397,
+      "learning_rate": 4.6927749053136866e-05,
+      "loss": 0.5192,
+      "step": 240
+    },
+    {
+      "epoch": 0.17275747508305647,
+      "grad_norm": 0.09664598107337952,
+      "learning_rate": 4.640750071996995e-05,
+      "loss": 0.5217,
+      "step": 260
+    },
+    {
+      "epoch": 0.18604651162790697,
+      "grad_norm": 0.08245342969894409,
+      "learning_rate": 4.584994544724695e-05,
+      "loss": 0.5172,
+      "step": 280
+    },
+    {
+      "epoch": 0.19933554817275748,
+      "grad_norm": 0.08551981300115585,
+      "learning_rate": 4.5256054888834934e-05,
+      "loss": 0.5152,
+      "step": 300
+    },
+    {
+      "epoch": 0.21262458471760798,
+      "grad_norm": 0.09647104889154434,
+      "learning_rate": 4.4626864020252774e-05,
+      "loss": 0.5139,
+      "step": 320
+    },
+    {
+      "epoch": 0.22591362126245848,
+      "grad_norm": 0.09810427576303482,
+      "learning_rate": 4.3963469335015085e-05,
+      "loss": 0.5129,
+      "step": 340
+    },
+    {
+      "epoch": 0.23920265780730898,
+      "grad_norm": 0.08342389762401581,
+      "learning_rate": 4.326702693376844e-05,
+      "loss": 0.5119,
+      "step": 360
+    },
+    {
+      "epoch": 0.25249169435215946,
+      "grad_norm": 0.08738644421100616,
+      "learning_rate": 4.2538750509550054e-05,
+      "loss": 0.511,
+      "step": 380
+    },
+    {
+      "epoch": 0.26578073089701,
+      "grad_norm": 0.08475251495838165,
+      "learning_rate": 4.177990923267986e-05,
+      "loss": 0.5117,
+      "step": 400
+    },
+    {
+      "epoch": 0.26578073089701,
+      "eval_accuracy": 0.1953402564276045,
+      "eval_loss": 0.5438870787620544,
+      "eval_runtime": 15.5302,
+      "eval_samples_per_second": 118.093,
+      "eval_steps_per_second": 0.515,
+      "step": 400
+    },
+    {
+      "epoch": 0.27906976744186046,
+      "grad_norm": 0.07873477786779404,
+      "learning_rate": 4.099182553897229e-05,
+      "loss": 0.5084,
+      "step": 420
+    },
+    {
+      "epoch": 0.292358803986711,
+      "grad_norm": 0.09158772230148315,
+      "learning_rate": 4.017587282512181e-05,
+      "loss": 0.5065,
+      "step": 440
+    },
+    {
+      "epoch": 0.30564784053156147,
+      "grad_norm": 0.07729614526033401,
+      "learning_rate": 3.933347305527898e-05,
+      "loss": 0.5047,
+      "step": 460
+    },
+    {
+      "epoch": 0.31893687707641194,
+      "grad_norm": 0.08530613034963608,
+      "learning_rate": 3.846609428298757e-05,
+      "loss": 0.5049,
+      "step": 480
+    },
+    {
+      "epoch": 0.33222591362126247,
+      "grad_norm": 0.07760792225599289,
+      "learning_rate": 3.7575248092801686e-05,
+      "loss": 0.5035,
+      "step": 500
+    },
+    {
+      "epoch": 0.34551495016611294,
+      "grad_norm": 0.08521712571382523,
+      "learning_rate": 3.66624869660411e-05,
+      "loss": 0.5042,
+      "step": 520
+    },
+    {
+      "epoch": 0.3588039867109635,
+      "grad_norm": 0.08439727872610092,
+      "learning_rate": 3.572940157527572e-05,
+      "loss": 0.5021,
+      "step": 540
+    },
+    {
+      "epoch": 0.37209302325581395,
+      "grad_norm": 0.09042590111494064,
+      "learning_rate": 3.47776180122539e-05,
+      "loss": 0.5019,
+      "step": 560
+    },
+    {
+      "epoch": 0.3853820598006645,
+      "grad_norm": 0.08219762146472931,
+      "learning_rate": 3.3808794954105716e-05,
+      "loss": 0.501,
+      "step": 580
+    },
+    {
+      "epoch": 0.39867109634551495,
+      "grad_norm": 0.08426713198423386,
+      "learning_rate": 3.282462077275947e-05,
+      "loss": 0.5013,
+      "step": 600
+    },
+    {
+      "epoch": 0.39867109634551495,
+      "eval_accuracy": 0.19588631180347973,
+      "eval_loss": 0.5341373682022095,
+      "eval_runtime": 16.1072,
+      "eval_samples_per_second": 113.862,
+      "eval_steps_per_second": 0.497,
+      "step": 600
+    },
+    {
+      "epoch": 0.4119601328903654,
+      "grad_norm": 0.08020314574241638,
+      "learning_rate": 3.1826810592609036e-05,
+      "loss": 0.4968,
+      "step": 620
+    },
+    {
+      "epoch": 0.42524916943521596,
+      "grad_norm": 0.07975760847330093,
+      "learning_rate": 3.081710330155942e-05,
+      "loss": 0.4997,
+      "step": 640
+    },
+    {
+      "epoch": 0.43853820598006643,
+      "grad_norm": 0.08056964725255966,
+      "learning_rate": 2.979725852065981e-05,
+      "loss": 0.4968,
+      "step": 660
+    },
+    {
+      "epoch": 0.45182724252491696,
+      "grad_norm": 0.08022565394639969,
+      "learning_rate": 2.876905353760459e-05,
+      "loss": 0.4976,
+      "step": 680
+    },
+    {
+      "epoch": 0.46511627906976744,
+      "grad_norm": 0.08131925761699677,
+      "learning_rate": 2.7734280209446865e-05,
+      "loss": 0.4973,
+      "step": 700
+    },
+    {
+      "epoch": 0.47840531561461797,
+      "grad_norm": 0.07562076300382614,
+      "learning_rate": 2.6694741839921732e-05,
+      "loss": 0.4956,
+      "step": 720
+    },
+    {
+      "epoch": 0.49169435215946844,
+      "grad_norm": 0.07877329736948013,
+      "learning_rate": 2.5652250036821523e-05,
+      "loss": 0.4966,
+      "step": 740
+    },
+    {
+      "epoch": 0.5049833887043189,
+      "grad_norm": 0.08014395087957382,
+      "learning_rate": 2.4608621554899362e-05,
+      "loss": 0.4934,
+      "step": 760
+    },
+    {
+      "epoch": 0.5182724252491694,
+      "grad_norm": 0.07770328223705292,
+      "learning_rate": 2.356567512980326e-05,
+      "loss": 0.4934,
+      "step": 780
+    },
+    {
+      "epoch": 0.53156146179402,
+      "grad_norm": 0.07732851803302765,
+      "learning_rate": 2.252522830855798e-05,
+      "loss": 0.4951,
+      "step": 800
+    },
+    {
+      "epoch": 0.53156146179402,
+      "eval_accuracy": 0.19623978277118043,
+      "eval_loss": 0.5274041295051575,
+      "eval_runtime": 16.4552,
+      "eval_samples_per_second": 111.454,
+      "eval_steps_per_second": 0.486,
+      "step": 800
+    },
+    {
+      "epoch": 0.5448504983388704,
+      "grad_norm": 0.07608461380004883,
+      "learning_rate": 2.1489094282118395e-05,
+      "loss": 0.4896,
+      "step": 820
+    },
+    {
+      "epoch": 0.5581395348837209,
+      "grad_norm": 0.07657533138990402,
+      "learning_rate": 2.0459078725514092e-05,
+      "loss": 0.4918,
+      "step": 840
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 0.07983728498220444,
+      "learning_rate": 1.9436976651092144e-05,
+      "loss": 0.4927,
+      "step": 860
+    },
+    {
+      "epoch": 0.584717607973422,
+      "grad_norm": 0.07355430722236633,
+      "learning_rate": 1.8424569280341653e-05,
+      "loss": 0.493,
+      "step": 880
+    },
+    {
+      "epoch": 0.5980066445182725,
+      "grad_norm": 0.08014149218797684,
+      "learning_rate": 1.7423620939751788e-05,
+      "loss": 0.4922,
+      "step": 900
+    },
+    {
+      "epoch": 0.6112956810631229,
+      "grad_norm": 0.07500924915075302,
+      "learning_rate": 1.6435875986112685e-05,
+      "loss": 0.491,
+      "step": 920
+    },
+    {
+      "epoch": 0.6245847176079734,
+      "grad_norm": 0.07356715947389603,
+      "learning_rate": 1.546305576661776e-05,
+      "loss": 0.4909,
+      "step": 940
+    },
+    {
+      "epoch": 0.6378737541528239,
+      "grad_norm": 0.07140863686800003,
+      "learning_rate": 1.4506855619064846e-05,
+      "loss": 0.489,
+      "step": 960
+    },
+    {
+      "epoch": 0.6511627906976745,
+      "grad_norm": 0.07692987471818924,
+      "learning_rate": 1.3568941917384036e-05,
+      "loss": 0.4902,
+      "step": 980
+    },
+    {
+      "epoch": 0.6644518272425249,
+      "grad_norm": 0.07356040179729462,
+      "learning_rate": 1.2650949167640997e-05,
+      "loss": 0.4894,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6644518272425249,
+      "eval_accuracy": 0.19652373156663552,
+      "eval_loss": 0.5229406952857971,
+      "eval_runtime": 15.6791,
+      "eval_samples_per_second": 116.971,
+      "eval_steps_per_second": 0.51,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6777408637873754,
+      "grad_norm": 0.0691773071885109,
+      "learning_rate": 1.1754477159576499e-05,
+      "loss": 0.4869,
+      "step": 1020
+    },
+    {
+      "epoch": 0.6910299003322259,
+      "grad_norm": 0.07505939155817032,
+      "learning_rate": 1.088108817864629e-05,
+      "loss": 0.4865,
+      "step": 1040
+    },
+    {
+      "epoch": 0.7043189368770764,
+      "grad_norm": 0.06973451375961304,
+      "learning_rate": 1.003230428341979e-05,
+      "loss": 0.4888,
+      "step": 1060
+    },
+    {
+      "epoch": 0.717607973421927,
+      "grad_norm": 0.07225219160318375,
+      "learning_rate": 9.209604653082326e-06,
+      "loss": 0.4858,
+      "step": 1080
+    },
+    {
+      "epoch": 0.7308970099667774,
+      "grad_norm": 0.07558443397283554,
+      "learning_rate": 8.414423009663563e-06,
+      "loss": 0.4891,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7441860465116279,
+      "grad_norm": 0.0698658898472786,
+      "learning_rate": 7.648145119484152e-06,
+      "loss": 0.4871,
+      "step": 1120
+    },
+    {
+      "epoch": 0.7574750830564784,
+      "grad_norm": 0.06963298469781876,
+      "learning_rate": 6.912106378175098e-06,
+      "loss": 0.4884,
+      "step": 1140
+    },
+    {
+      "epoch": 0.770764119601329,
+      "grad_norm": 0.0692787617444992,
+      "learning_rate": 6.207589483478266e-06,
+      "loss": 0.4877,
+      "step": 1160
+    },
+    {
+      "epoch": 0.7840531561461794,
+      "grad_norm": 0.07016126066446304,
+      "learning_rate": 5.53582219988382e-06,
+      "loss": 0.4856,
+      "step": 1180
+    },
+    {
+      "epoch": 0.7973421926910299,
+      "grad_norm": 0.06945677101612091,
+      "learning_rate": 4.897975218999926e-06,
+      "loss": 0.4868,
+      "step": 1200
+    },
+    {
+      "epoch": 0.7973421926910299,
+      "eval_accuracy": 0.19665158843513314,
+      "eval_loss": 0.5205041170120239,
+      "eval_runtime": 14.8321,
+      "eval_samples_per_second": 123.651,
+      "eval_steps_per_second": 0.539,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8106312292358804,
+      "grad_norm": 0.07045505195856094,
+      "learning_rate": 4.295160119383712e-06,
+      "loss": 0.4859,
+      "step": 1220
+    },
+    {
+      "epoch": 0.8239202657807309,
+      "grad_norm": 0.06839559227228165,
+      "learning_rate": 3.728427429388709e-06,
+      "loss": 0.4863,
+      "step": 1240
+    },
+    {
+      "epoch": 0.8372093023255814,
+      "grad_norm": 0.06684821844100952,
+      "learning_rate": 3.198764796404807e-06,
+      "loss": 0.4856,
+      "step": 1260
+    },
+    {
+      "epoch": 0.8504983388704319,
+      "grad_norm": 0.06731660664081573,
+      "learning_rate": 2.707095265681081e-06,
+      "loss": 0.4854,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8637873754152824,
+      "grad_norm": 0.06780705600976944,
+      "learning_rate": 2.254275671731007e-06,
+      "loss": 0.4868,
+      "step": 1300
+    },
+    {
+      "epoch": 0.8770764119601329,
+      "grad_norm": 0.06815515458583832,
+      "learning_rate": 1.8410951451234533e-06,
+      "loss": 0.4854,
+      "step": 1320
+    },
+    {
+      "epoch": 0.8903654485049833,
+      "grad_norm": 0.0670180469751358,
+      "learning_rate": 1.4682737372615967e-06,
+      "loss": 0.485,
+      "step": 1340
+    },
+    {
+      "epoch": 0.9036544850498339,
+      "grad_norm": 0.06649608910083771,
+      "learning_rate": 1.1364611655463736e-06,
+      "loss": 0.4867,
+      "step": 1360
+    },
+    {
+      "epoch": 0.9169435215946844,
+      "grad_norm": 0.0674930214881897,
+      "learning_rate": 8.462356811112987e-07,
+      "loss": 0.4865,
+      "step": 1380
+    },
+    {
+      "epoch": 0.9302325581395349,
+      "grad_norm": 0.06808231770992279,
+      "learning_rate": 5.981030611018234e-07,
+      "loss": 0.4864,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9302325581395349,
+      "eval_accuracy": 0.19667556159797644,
+      "eval_loss": 0.519675612449646,
+      "eval_runtime": 14.9507,
+      "eval_samples_per_second": 122.67,
+      "eval_steps_per_second": 0.535,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9435215946843853,
+      "grad_norm": 0.06696037203073502,
+      "learning_rate": 3.9249572725543196e-07,
+      "loss": 0.4852,
+      "step": 1420
+    },
+    {
+      "epoch": 0.9568106312292359,
+      "grad_norm": 0.06675516068935394,
+      "learning_rate": 2.297719923185032e-07,
+      "loss": 0.4875,
+      "step": 1440
+    },
+    {
+      "epoch": 0.9700996677740864,
+      "grad_norm": 0.06678403913974762,
+      "learning_rate": 1.1021543561322012e-07,
+      "loss": 0.4852,
+      "step": 1460
+    },
+    {
+      "epoch": 0.9833887043189369,
+      "grad_norm": 0.0660882443189621,
+      "learning_rate": 3.403440884269526e-08,
+      "loss": 0.4848,
+      "step": 1480
+    },
+    {
+      "epoch": 0.9966777408637874,
+      "grad_norm": 0.06698651611804962,
+      "learning_rate": 1.3616729956228425e-09,
+      "loss": 0.4847,
+      "step": 1500
+    },
+    {
+      "epoch": 1.0,
+      "step": 1505,
+      "total_flos": 2.786803439690685e+19,
+      "train_loss": 0.0,
+      "train_runtime": 4.5361,
+      "train_samples_per_second": 339673.082,
+      "train_steps_per_second": 331.781
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 1505,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.786803439690685e+19,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}