{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1092,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.027472527472527472, "grad_norm": 12.728795051574707, "learning_rate": 5e-06, "loss": 0.9765, "step": 10 },
    { "epoch": 0.054945054945054944, "grad_norm": 2.706224203109741, "learning_rate": 5e-06, "loss": 0.839, "step": 20 },
    { "epoch": 0.08241758241758242, "grad_norm": 1.059246301651001, "learning_rate": 5e-06, "loss": 0.7854, "step": 30 },
    { "epoch": 0.10989010989010989, "grad_norm": 0.8609521389007568, "learning_rate": 5e-06, "loss": 0.7552, "step": 40 },
    { "epoch": 0.13736263736263737, "grad_norm": 0.8304686546325684, "learning_rate": 5e-06, "loss": 0.7404, "step": 50 },
    { "epoch": 0.16483516483516483, "grad_norm": 0.95517498254776, "learning_rate": 5e-06, "loss": 0.717, "step": 60 },
    { "epoch": 0.19230769230769232, "grad_norm": 0.8028889298439026, "learning_rate": 5e-06, "loss": 0.7069, "step": 70 },
    { "epoch": 0.21978021978021978, "grad_norm": 1.287640929222107, "learning_rate": 5e-06, "loss": 0.6991, "step": 80 },
    { "epoch": 0.24725274725274726, "grad_norm": 2.1982975006103516, "learning_rate": 5e-06, "loss": 0.6911, "step": 90 },
    { "epoch": 0.27472527472527475, "grad_norm": 0.6827638149261475, "learning_rate": 5e-06, "loss": 0.6901, "step": 100 },
    { "epoch": 0.3021978021978022, "grad_norm": 0.9066330194473267, "learning_rate": 5e-06, "loss": 0.6826, "step": 110 },
    { "epoch": 0.32967032967032966, "grad_norm": 0.6553434133529663, "learning_rate": 5e-06, "loss": 0.6863, "step": 120 },
    { "epoch": 0.35714285714285715, "grad_norm": 0.9135424494743347, "learning_rate": 5e-06, "loss": 0.6745, "step": 130 },
    { "epoch": 0.38461538461538464, "grad_norm": 0.7348975539207458, "learning_rate": 5e-06, "loss": 0.6759, "step": 140 },
    { "epoch": 0.41208791208791207, "grad_norm": 0.9071689248085022, "learning_rate": 5e-06, "loss": 0.6673, "step": 150 },
    { "epoch": 0.43956043956043955, "grad_norm": 0.7530332207679749, "learning_rate": 5e-06, "loss": 0.6651, "step": 160 },
    { "epoch": 0.46703296703296704, "grad_norm": 0.7991921305656433, "learning_rate": 5e-06, "loss": 0.6716, "step": 170 },
    { "epoch": 0.4945054945054945, "grad_norm": 0.8702341318130493, "learning_rate": 5e-06, "loss": 0.6589, "step": 180 },
    { "epoch": 0.521978021978022, "grad_norm": 0.5698891878128052, "learning_rate": 5e-06, "loss": 0.6618, "step": 190 },
    { "epoch": 0.5494505494505495, "grad_norm": 0.5574747323989868, "learning_rate": 5e-06, "loss": 0.6629, "step": 200 },
    { "epoch": 0.5769230769230769, "grad_norm": 0.6472902894020081, "learning_rate": 5e-06, "loss": 0.6582, "step": 210 },
    { "epoch": 0.6043956043956044, "grad_norm": 0.5712385177612305, "learning_rate": 5e-06, "loss": 0.6551, "step": 220 },
    { "epoch": 0.6318681318681318, "grad_norm": 0.492366224527359, "learning_rate": 5e-06, "loss": 0.6485, "step": 230 },
    { "epoch": 0.6593406593406593, "grad_norm": 0.7048478126525879, "learning_rate": 5e-06, "loss": 0.6484, "step": 240 },
    { "epoch": 0.6868131868131868, "grad_norm": 0.537118136882782, "learning_rate": 5e-06, "loss": 0.6488, "step": 250 },
    { "epoch": 0.7142857142857143, "grad_norm": 0.49864494800567627, "learning_rate": 5e-06, "loss": 0.6463, "step": 260 },
    { "epoch": 0.7417582417582418, "grad_norm": 0.7438676953315735, "learning_rate": 5e-06, "loss": 0.6498, "step": 270 },
    { "epoch": 0.7692307692307693, "grad_norm": 0.5052060484886169, "learning_rate": 5e-06, "loss": 0.6464, "step": 280 },
    { "epoch": 0.7967032967032966, "grad_norm": 0.548953115940094, "learning_rate": 5e-06, "loss": 0.6441, "step": 290 },
    { "epoch": 0.8241758241758241, "grad_norm": 0.6289977431297302, "learning_rate": 5e-06, "loss": 0.6415, "step": 300 },
    { "epoch": 0.8516483516483516, "grad_norm": 0.5599405765533447, "learning_rate": 5e-06, "loss": 0.6419, "step": 310 },
    { "epoch": 0.8791208791208791, "grad_norm": 0.6185189485549927, "learning_rate": 5e-06, "loss": 0.6371, "step": 320 },
    { "epoch": 0.9065934065934066, "grad_norm": 0.5025250911712646, "learning_rate": 5e-06, "loss": 0.6388, "step": 330 },
    { "epoch": 0.9340659340659341, "grad_norm": 0.596026599407196, "learning_rate": 5e-06, "loss": 0.6371, "step": 340 },
    { "epoch": 0.9615384615384616, "grad_norm": 0.5150969624519348, "learning_rate": 5e-06, "loss": 0.6323, "step": 350 },
    { "epoch": 0.989010989010989, "grad_norm": 0.5902632474899292, "learning_rate": 5e-06, "loss": 0.6304, "step": 360 },
    { "epoch": 1.0, "eval_loss": 0.6319476366043091, "eval_runtime": 33.2599, "eval_samples_per_second": 294.589, "eval_steps_per_second": 1.173, "step": 364 },
    { "epoch": 1.0164835164835164, "grad_norm": 0.684147298336029, "learning_rate": 5e-06, "loss": 0.6119, "step": 370 },
    { "epoch": 1.043956043956044, "grad_norm": 0.6040191054344177, "learning_rate": 5e-06, "loss": 0.5939, "step": 380 },
    { "epoch": 1.0714285714285714, "grad_norm": 0.6056487560272217, "learning_rate": 5e-06, "loss": 0.5955, "step": 390 },
    { "epoch": 1.098901098901099, "grad_norm": 0.6380589008331299, "learning_rate": 5e-06, "loss": 0.5917, "step": 400 },
    { "epoch": 1.1263736263736264, "grad_norm": 0.734009861946106, "learning_rate": 5e-06, "loss": 0.5912, "step": 410 },
    { "epoch": 1.1538461538461537, "grad_norm": 0.5856278538703918, "learning_rate": 5e-06, "loss": 0.5887, "step": 420 },
    { "epoch": 1.1813186813186813, "grad_norm": 0.5818033218383789, "learning_rate": 5e-06, "loss": 0.5959, "step": 430 },
    { "epoch": 1.2087912087912087, "grad_norm": 0.5466902852058411, "learning_rate": 5e-06, "loss": 0.5927, "step": 440 },
    { "epoch": 1.2362637362637363, "grad_norm": 0.5269283056259155, "learning_rate": 5e-06, "loss": 0.5928, "step": 450 },
    { "epoch": 1.2637362637362637, "grad_norm": 0.5852891802787781, "learning_rate": 5e-06, "loss": 0.5888, "step": 460 },
    { "epoch": 1.2912087912087913, "grad_norm": 0.5838508605957031, "learning_rate": 5e-06, "loss": 0.5868, "step": 470 },
    { "epoch": 1.3186813186813187, "grad_norm": 0.5828148722648621, "learning_rate": 5e-06, "loss": 0.5881, "step": 480 },
    { "epoch": 1.3461538461538463, "grad_norm": 0.5466414093971252, "learning_rate": 5e-06, "loss": 0.5844, "step": 490 },
    { "epoch": 1.3736263736263736, "grad_norm": 0.6251242160797119, "learning_rate": 5e-06, "loss": 0.5867, "step": 500 },
    { "epoch": 1.401098901098901, "grad_norm": 0.6761026382446289, "learning_rate": 5e-06, "loss": 0.5878, "step": 510 },
    { "epoch": 1.4285714285714286, "grad_norm": 0.8003695011138916, "learning_rate": 5e-06, "loss": 0.5858, "step": 520 },
    { "epoch": 1.456043956043956, "grad_norm": 0.6401771903038025, "learning_rate": 5e-06, "loss": 0.5859, "step": 530 },
    { "epoch": 1.4835164835164836, "grad_norm": 0.722094714641571, "learning_rate": 5e-06, "loss": 0.5885, "step": 540 },
    { "epoch": 1.510989010989011, "grad_norm": 0.6249004006385803, "learning_rate": 5e-06, "loss": 0.5846, "step": 550 },
    { "epoch": 1.5384615384615383, "grad_norm": 0.5867748856544495, "learning_rate": 5e-06, "loss": 0.5841, "step": 560 },
    { "epoch": 1.565934065934066, "grad_norm": 0.6063013672828674, "learning_rate": 5e-06, "loss": 0.5854, "step": 570 },
    { "epoch": 1.5934065934065935, "grad_norm": 0.538633406162262, "learning_rate": 5e-06, "loss": 0.5812, "step": 580 },
    { "epoch": 1.620879120879121, "grad_norm": 0.6249384880065918, "learning_rate": 5e-06, "loss": 0.5802, "step": 590 },
    { "epoch": 1.6483516483516483, "grad_norm": 0.5973653197288513, "learning_rate": 5e-06, "loss": 0.5848, "step": 600 },
    { "epoch": 1.6758241758241759, "grad_norm": 0.5950356125831604, "learning_rate": 5e-06, "loss": 0.5807, "step": 610 },
    { "epoch": 1.7032967032967035, "grad_norm": 0.5238327980041504, "learning_rate": 5e-06, "loss": 0.5753, "step": 620 },
    { "epoch": 1.7307692307692308, "grad_norm": 0.5083211660385132, "learning_rate": 5e-06, "loss": 0.5801, "step": 630 },
    { "epoch": 1.7582417582417582, "grad_norm": 0.8316366076469421, "learning_rate": 5e-06, "loss": 0.5745, "step": 640 },
    { "epoch": 1.7857142857142856, "grad_norm": 0.5875478386878967, "learning_rate": 5e-06, "loss": 0.58, "step": 650 },
    { "epoch": 1.8131868131868132, "grad_norm": 0.5625612735748291, "learning_rate": 5e-06, "loss": 0.5721, "step": 660 },
    { "epoch": 1.8406593406593408, "grad_norm": 0.5859466791152954, "learning_rate": 5e-06, "loss": 0.5784, "step": 670 },
    { "epoch": 1.8681318681318682, "grad_norm": 0.5293060541152954, "learning_rate": 5e-06, "loss": 0.575, "step": 680 },
    { "epoch": 1.8956043956043955, "grad_norm": 0.516545832157135, "learning_rate": 5e-06, "loss": 0.5771, "step": 690 },
    { "epoch": 1.9230769230769231, "grad_norm": 0.5076315999031067, "learning_rate": 5e-06, "loss": 0.5759, "step": 700 },
    { "epoch": 1.9505494505494505, "grad_norm": 0.46912071108818054, "learning_rate": 5e-06, "loss": 0.5738, "step": 710 },
    { "epoch": 1.978021978021978, "grad_norm": 0.5850493907928467, "learning_rate": 5e-06, "loss": 0.5776, "step": 720 },
    { "epoch": 2.0, "eval_loss": 0.6049710512161255, "eval_runtime": 33.2692, "eval_samples_per_second": 294.506, "eval_steps_per_second": 1.172, "step": 728 },
    { "epoch": 2.0054945054945055, "grad_norm": 0.8460399508476257, "learning_rate": 5e-06, "loss": 0.5647, "step": 730 },
    { "epoch": 2.032967032967033, "grad_norm": 0.5859711170196533, "learning_rate": 5e-06, "loss": 0.5343, "step": 740 },
    { "epoch": 2.0604395604395602, "grad_norm": 0.5478792190551758, "learning_rate": 5e-06, "loss": 0.5346, "step": 750 },
    { "epoch": 2.087912087912088, "grad_norm": 0.5353866815567017, "learning_rate": 5e-06, "loss": 0.5386, "step": 760 },
    { "epoch": 2.1153846153846154, "grad_norm": 0.589070737361908, "learning_rate": 5e-06, "loss": 0.5364, "step": 770 },
    { "epoch": 2.142857142857143, "grad_norm": 0.7154620289802551, "learning_rate": 5e-06, "loss": 0.5357, "step": 780 },
    { "epoch": 2.17032967032967, "grad_norm": 0.5796002149581909, "learning_rate": 5e-06, "loss": 0.532, "step": 790 },
    { "epoch": 2.197802197802198, "grad_norm": 0.5556730031967163, "learning_rate": 5e-06, "loss": 0.5389, "step": 800 },
    { "epoch": 2.2252747252747254, "grad_norm": 0.5793793201446533, "learning_rate": 5e-06, "loss": 0.5323, "step": 810 },
    { "epoch": 2.2527472527472527, "grad_norm": 0.6171457767486572, "learning_rate": 5e-06, "loss": 0.5325, "step": 820 },
    { "epoch": 2.28021978021978, "grad_norm": 0.563108503818512, "learning_rate": 5e-06, "loss": 0.5305, "step": 830 },
    { "epoch": 2.3076923076923075, "grad_norm": 0.6023669242858887, "learning_rate": 5e-06, "loss": 0.5336, "step": 840 },
    { "epoch": 2.3351648351648353, "grad_norm": 0.6838552355766296, "learning_rate": 5e-06, "loss": 0.5299, "step": 850 },
    { "epoch": 2.3626373626373627, "grad_norm": 0.5465499758720398, "learning_rate": 5e-06, "loss": 0.5352, "step": 860 },
    { "epoch": 2.39010989010989, "grad_norm": 0.6073447465896606, "learning_rate": 5e-06, "loss": 0.534, "step": 870 },
    { "epoch": 2.4175824175824174, "grad_norm": 0.5182047486305237, "learning_rate": 5e-06, "loss": 0.5317, "step": 880 },
    { "epoch": 2.4450549450549453, "grad_norm": 0.543487548828125, "learning_rate": 5e-06, "loss": 0.5381, "step": 890 },
    { "epoch": 2.4725274725274726, "grad_norm": 0.5238776803016663, "learning_rate": 5e-06, "loss": 0.5381, "step": 900 },
    { "epoch": 2.5, "grad_norm": 0.5654677152633667, "learning_rate": 5e-06, "loss": 0.5316, "step": 910 },
    { "epoch": 2.5274725274725274, "grad_norm": 0.6107837557792664, "learning_rate": 5e-06, "loss": 0.5368, "step": 920 },
    { "epoch": 2.5549450549450547, "grad_norm": 0.5588423609733582, "learning_rate": 5e-06, "loss": 0.5402, "step": 930 },
    { "epoch": 2.5824175824175826, "grad_norm": 0.5652673840522766, "learning_rate": 5e-06, "loss": 0.5382, "step": 940 },
    { "epoch": 2.60989010989011, "grad_norm": 0.5125435590744019, "learning_rate": 5e-06, "loss": 0.539, "step": 950 },
    { "epoch": 2.6373626373626373, "grad_norm": 0.5421128273010254, "learning_rate": 5e-06, "loss": 0.5335, "step": 960 },
    { "epoch": 2.6648351648351647, "grad_norm": 0.5882076621055603, "learning_rate": 5e-06, "loss": 0.5341, "step": 970 },
    { "epoch": 2.6923076923076925, "grad_norm": 0.6641466021537781, "learning_rate": 5e-06, "loss": 0.5301, "step": 980 },
    { "epoch": 2.71978021978022, "grad_norm": 0.5694056153297424, "learning_rate": 5e-06, "loss": 0.537, "step": 990 },
    { "epoch": 2.7472527472527473, "grad_norm": 0.48833751678466797, "learning_rate": 5e-06, "loss": 0.5333, "step": 1000 },
    { "epoch": 2.7747252747252746, "grad_norm": 0.5480605959892273, "learning_rate": 5e-06, "loss": 0.5352, "step": 1010 },
    { "epoch": 2.802197802197802, "grad_norm": 0.6873406767845154, "learning_rate": 5e-06, "loss": 0.5389, "step": 1020 },
    { "epoch": 2.82967032967033, "grad_norm": 0.5708329081535339, "learning_rate": 5e-06, "loss": 0.5346, "step": 1030 },
    { "epoch": 2.857142857142857, "grad_norm": 0.5935104489326477, "learning_rate": 5e-06, "loss": 0.5331, "step": 1040 },
    { "epoch": 2.8846153846153846, "grad_norm": 0.5802971124649048, "learning_rate": 5e-06, "loss": 0.53, "step": 1050 },
    { "epoch": 2.912087912087912, "grad_norm": 0.5741780400276184, "learning_rate": 5e-06, "loss": 0.5291, "step": 1060 },
    { "epoch": 2.9395604395604398, "grad_norm": 0.578640878200531, "learning_rate": 5e-06, "loss": 0.5326, "step": 1070 },
    { "epoch": 2.967032967032967, "grad_norm": 0.5405354499816895, "learning_rate": 5e-06, "loss": 0.5314, "step": 1080 },
    { "epoch": 2.9945054945054945, "grad_norm": 0.5508515238761902, "learning_rate": 5e-06, "loss": 0.5347, "step": 1090 },
    { "epoch": 3.0, "eval_loss": 0.5989904403686523, "eval_runtime": 33.1874, "eval_samples_per_second": 295.232, "eval_steps_per_second": 1.175, "step": 1092 },
    { "epoch": 3.0, "step": 1092, "total_flos": 5.156086030431106e+19, "train_loss": 0.6005454126731816, "train_runtime": 7773.4306, "train_samples_per_second": 71.841, "train_steps_per_second": 0.14 }
  ],
  "logging_steps": 10,
  "max_steps": 1092,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.156086030431106e+19,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}